This project is focused on enhancing user engagement and overall website performance for Tasty Bytes. The core challenge is leveraging our historical data to maximize the visibility of our most successful content.
Tasty Bytes aims to display recipes that attract high user engagement, which is the primary driver of our platform’s success.
By accurately selecting and promoting popular recipes, we have the potential to significantly increase overall website traffic by up to 40%.
Aim: Predict which recipes will be popular \(\mathbf{80\%}\) of the time and minimize showing unpopular ones.
To achieve the 80% accuracy aim, the project will focus on the following core objectives:
Predict high-traffic (popular) recipes.
Minimize the chance of showing unpopular recipes(False Positive).
Define and monitor a precision-based performance metric.
# Load the dataset
df <- read_csv("recipe_site_traffic_2212.csv")
## Rows: 947 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): recipe, category, servings, high_traffic
## dbl (4): calories, carbohydrate, sugar, protein
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display the first 5 rows
print("Initial Data Head:")
## [1] "Initial Data Head:"
kable(head(df))
| recipe | calories | carbohydrate | sugar | protein | category | servings | high_traffic |
|---|---|---|---|---|---|---|---|
| 001 | NA | NA | NA | NA | Pork | 6 | High |
| 002 | 35.48 | 38.56 | 0.66 | 0.92 | Potato | 4 | High |
| 003 | 914.28 | 42.68 | 3.09 | 2.88 | Breakfast | 1 | NA |
| 004 | 97.03 | 30.56 | 38.63 | 0.02 | Beverages | 4 | High |
| 005 | 27.05 | 1.85 | 0.80 | 0.53 | Beverages | 4 | NA |
| 006 | 691.15 | 3.46 | 1.65 | 53.93 | One Dish Meal | 2 | High |
n_distinct(df$servings)
## [1] 6
unique(df$servings)
## [1] "6" "4" "1" "2" "4 as a snack"
## [6] "6 as a snack"
# Check data structure
print("Data Structure (glimpse):")
## [1] "Data Structure (glimpse):"
glimpse(df)
## Rows: 947
## Columns: 8
## $ recipe <chr> "001", "002", "003", "004", "005", "006", "007", "008", "…
## $ calories <dbl> NA, 35.48, 914.28, 97.03, 27.05, 691.15, 183.94, 299.14, …
## $ carbohydrate <dbl> NA, 38.56, 42.68, 30.56, 1.85, 3.46, 47.95, 3.17, 3.78, 4…
## $ sugar <dbl> NA, 0.66, 3.09, 38.63, 0.80, 1.65, 9.75, 0.40, 3.37, 3.99…
## $ protein <dbl> NA, 0.92, 2.88, 0.02, 0.53, 53.93, 46.71, 32.40, 3.79, 11…
## $ category <chr> "Pork", "Potato", "Breakfast", "Beverages", "Beverages", …
## $ servings <chr> "6", "4", "1", "4", "4", "2", "4", "4", "6", "2", "1", "6…
## $ high_traffic <chr> "High", "High", NA, "High", NA, "High", NA, NA, "High", N…
# Check for missing values
print("Initial Missing Value Counts:")
## [1] "Initial Missing Value Counts:"
sapply(df, function(x) sum(is.na(x)))
## recipe calories carbohydrate sugar protein category
## 0 52 52 52 52 0
## servings high_traffic
## 0 373
# --- Data Cleaning Pipeline ---
df_clean <- df %>%
# Impute missing 'high_traffic' values with "Low"
mutate(high_traffic = replace_na(high_traffic, "Low")) %>%
# Drop rows with NA in 'calories' (also drops NAs in carbohydrate, sugar, protein)
drop_na(calories) %>%
# Clean and convert 'servings' column
mutate(
servings = str_replace(servings, "as a snack", ""), # Remove "as a snack"
servings = as.integer(str_trim(servings)) # Trim whitespace and convert to integer
) %>%
# Ensure 'protein' is numeric (as noted in markdown)
mutate(protein = as.numeric(protein))
# --- Validation Checks After Cleaning ---
# Check final NA counts
print("Missing Value Counts After Cleaning:")
## [1] "Missing Value Counts After Cleaning:"
sapply(df_clean, function(x) sum(is.na(x)))
## recipe calories carbohydrate sugar protein category
## 0 0 0 0 0 0
## servings high_traffic
## 0 0
# Check 'servings' column values
print("Cleaned 'servings' Counts:")
## [1] "Cleaned 'servings' Counts:"
print(table(df_clean$servings))
##
## 1 2 4 6
## 169 174 367 185
# Display summary statistics
print("Summary Statistics After Cleaning:")
## [1] "Summary Statistics After Cleaning:"
summary(df_clean)
## recipe calories carbohydrate sugar
## Length:895 Min. : 0.14 Min. : 0.030 Min. : 0.010
## Class :character 1st Qu.: 110.43 1st Qu.: 8.375 1st Qu.: 1.690
## Mode :character Median : 288.55 Median : 21.480 Median : 4.550
## Mean : 435.94 Mean : 35.070 Mean : 9.047
## 3rd Qu.: 597.65 3rd Qu.: 44.965 3rd Qu.: 9.800
## Max. :3633.16 Max. :530.420 Max. :148.750
## protein category servings high_traffic
## Min. : 0.000 Length:895 Min. :1.000 Length:895
## 1st Qu.: 3.195 Class :character 1st Qu.:2.000 Class :character
## Median : 10.800 Mode :character Median :4.000 Mode :character
## Mean : 24.149 Mean :3.458
## 3rd Qu.: 30.200 3rd Qu.:4.000
## Max. :363.360 Max. :6.000
# high_traffic (Target Variable) Pie Chart
# Calculate counts and percentages
traffic_counts <- df_clean %>%
group_by(high_traffic) %>%
summarise(count = n(), .groups = 'drop') %>%
mutate(percentage = count / sum(count))
print(traffic_counts)
## # A tibble: 2 × 3
## high_traffic count percentage
## <chr> <int> <dbl>
## 1 High 535 0.598
## 2 Low 360 0.402
# Plot pie chart
ggplot(traffic_counts, aes(x = "", y = percentage, fill = high_traffic)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y", start = 0) +
# Add labels (percentage and count)
geom_text(aes(label = paste0(scales::percent(percentage, accuracy = 0.01), "\n(", count, ")")),
position = position_stack(vjust = 0.5)) +
labs(title = "Recipe Traffic Distribution", fill = "Traffic") +
theme_void()
Finding: The cleaned dataset consists of 59.78% High-traffic recipes and 40.22% Low-traffic recipes.
Interpretation: The dataset is slightly imbalanced, with a majority of recipes in the dataset driving high traffic.
# category Column Bar Chart
ggplot(df_clean, aes(x = category, fill = category)) +
geom_bar() +
geom_text(stat='count', aes(label=..count..), vjust = -0.5, position = position_stack(vjust = 1)) +
labs(title = "Number of Recipes per Category", x = "Category", y = "Count") +
theme_minimal() +
theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# --- Filter data to only Chicken categories ---
chicken_categories <- c("Chicken", "Chicken Breast")
chicken_data <- df_clean %>%
filter(category %in% chicken_categories)
# --- Calculate proportions ---
impact <- chicken_data %>%
# Count combinations of category and high_traffic
group_by(category, high_traffic) %>%
summarise(count = n(), .groups = 'drop') %>%
# Calculate proportion within each category
group_by(category) %>%
mutate(Proportion = count / sum(count))
# --- Print the impact table ---
impact_table_wide <- impact %>%
select(-count) %>% # Drop the raw count
pivot_wider(names_from = high_traffic, values_from = Proportion)
print("Proportion of High/Low Traffic for Chicken Categories:")
## [1] "Proportion of High/Low Traffic for Chicken Categories:"
kable(impact_table_wide, digits = 3)
| category | High | Low |
|---|---|---|
| Chicken | 0.362 | 0.638 |
| Chicken Breast | 0.468 | 0.532 |
# --- Plot the proportions ---
chicken_plot <- ggplot(impact, aes(x = category, y = Proportion, fill = high_traffic)) +
geom_bar(stat = "identity", position = position_dodge()) +
labs(
title = "High Traffic Proportion for Chicken Categories",
y = "Proportion",
x = "Category",
fill = "High Traffic"
) +
theme_minimal()
# Save the plot ---
ggsave("High_Traffic_Proportion_for_Chicken_Categories.png", plot = chicken_plot)
# --- Display the plot ---
print(chicken_plot)
# Nutritional Columns Boxplots
# We "pivot" the data to a long format for easy plotting with ggplot2
df_long <- df_clean %>%
select(calories, carbohydrate, sugar, protein, high_traffic) %>%
pivot_longer(
cols = -high_traffic,
names_to = "nutrient",
values_to = "value"
)
ggplot(df_long, aes(x = high_traffic, y = value, fill = high_traffic)) +
geom_boxplot() +
# Use facet_wrap to create separate plots for each nutrient
facet_wrap(~ nutrient, scales = "free_y") +
labs(title = "Nutritional Value Distribution by Traffic",
x = "High Traffic", y = "Value") +
theme_minimal()
Findings & Interpretation:
calories, carbohydrate, sugar: The high-traffic group had significantly more high-value outliers.
protein: The distributions for both groups were almost identical.
Interpretation: Exceptionally high-calorie, high-carbohydrate, and especially high-sugar “indulgent” recipes are significant drivers of high traffic. Protein content appears to have no impact.
# Correlation Analysis Heatmap
# Select only the numeric columns for correlation
numeric_df <- df_clean %>% select(calories, carbohydrate, sugar, protein, servings)
# Calculate correlation matrix
cor_matrix <- round(cor(numeric_df), 2)
# Melt the matrix for ggplot2
melted_cormat <- melt(cor_matrix)
# Plot the heatmap
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() +
# Add correlation values as text
geom_text(aes(label = value), color = "black", size = 4) +
# Define the color scale
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1)) +
coord_fixed() +
labs(title = "Correlation Heatmap of Numerical Features")
Finding: All correlations were very low (close to 0).
Interpretation: This is a positive result. It indicates no multicollinearity, meaning each feature provides unique and independent information for the model.
# --- Calculate Averages ---
# (Equivalent to Python's .groupby().mean().reset_index())
traffic_avg <- df_clean %>%
# Group by the two categorical variables
group_by(category, high_traffic) %>%
# Calculate the mean for all numeric columns of interest
summarise(
calories = mean(calories, na.rm = TRUE),
carbohydrate = mean(carbohydrate, na.rm = TRUE),
sugar = mean(sugar, na.rm = TRUE),
protein = mean(protein, na.rm = TRUE),
servings = mean(servings, na.rm = TRUE),
.groups = 'drop' # Ungroup after summarising
)
# --- Pivot to Long Format for Plotting ---
# This is the standard R/ggplot2 way to prepare data for faceted plots
traffic_avg_long <- traffic_avg %>%
pivot_longer(
cols = c(calories, carbohydrate, sugar, protein, servings),
names_to = "nutrient", # The name of the new column for feature names
values_to = "average" # The name of the new column for the mean values
)
# --- Create the Faceted Bar Plot ---
# (Equivalent to Python's loop over subplots)
ggplot(traffic_avg_long, aes(x = category, y = average, fill = high_traffic)) +
# Create grouped bar chart
geom_bar(stat = "identity", position = position_dodge()) +
# Add value labels on top of bars, dodging them to match
geom_text(
aes(label = round(average, 1)),
position = position_dodge(width = 0.9),
vjust = -0.3, # Adjust vertical position to be just above the bar
size = 3
) +
# Create separate plots for each nutrient, stacked vertically (1 column)
# scales = "free_y" allows each plot to have its own y-axis range
facet_wrap(~ nutrient, ncol = 1, scales = "free_y") +
labs(
title = "Average Nutritional Values by Category & Traffic",
x = "Category",
y = "Average Value",
fill = "Traffic Level"
) +
# Rotate x-axis labels for readability
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
category and
high_traffic to factors# --- Model Development ---
# Prepare data for modeling
model_df <- df_clean %>%
# Drop the recipe identifier
select(-recipe) %>%
# Convert categorical variables to factors
mutate(
category = as.factor(category),
high_traffic = as.factor(high_traffic)
)
# Set the "High" level as the positive class for evaluation
# This ensures 'Precision' and 'Recall' are calculated for "High" traffic
model_df$high_traffic <- relevel(model_df$high_traffic, ref = "High")
# --- Train-Test Split ---
set.seed(42) # for reproducibility
trainIndex <- createDataPartition(model_df$high_traffic, p = 0.8,
list = FALSE,
times = 1)
train_data <- model_df[trainIndex, ]
test_data <- model_df[-trainIndex, ]
# --- Model Development & Evaluation ---
# Set up 10-fold cross-validation
# We use twoClassSummary to get ROC, Sensitivity, and Specificity
train_control <- trainControl(method = "cv", number = 10,
summaryFunction = twoClassSummary,
classProbs = TRUE)
# Baseline Model: Logistic Regression (with scaling)
set.seed(42)
model_lr <- train(
high_traffic ~ .,
data = train_data,
method = "glm",
family = "binomial",
preProcess = c("center", "scale"), # Scales numeric features
metric = "ROC", # Tune based on ROC AUC
trControl = train_control
)
print(model_lr)
## Generalized Linear Model
##
## 716 samples
## 6 predictor
## 2 classes: 'High', 'Low'
##
## Pre-processing: centered (15), scaled (15)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 646, 644, 644, 644, 645, 645, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8254922 0.7918605 0.7257389
# 2. Comparison Model: Random Forest
# (Equivalent to RandomForestClassifier)
# Note: RF doesn't require scaling, so we omit preProcess
set.seed(42)
model_rf <- train(
high_traffic ~ .,
data = train_data,
method = "rf",
metric = "ROC",
trControl = train_control,
tuneLength = 3 # Checks 3 'mtry' values; similar to default RF
)
print(model_rf)
## Random Forest
##
## 716 samples
## 6 predictor
## 2 classes: 'High', 'Low'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 646, 644, 644, 644, 645, 645, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.7904806 0.8082503 0.6597291
## 8 0.7905525 0.7849391 0.6075123
## 15 0.7875230 0.7964563 0.6142857
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 8.
# --- Model Evaluation ---
# Get predictions for the test set
preds_lr <- predict(model_lr, newdata = test_data)
preds_rf <- predict(model_rf, newdata = test_data)
# Generate Confusion Matrices
cm_lr <- confusionMatrix(preds_lr, test_data$high_traffic, positive = "High")
cm_rf <- confusionMatrix(preds_rf, test_data$high_traffic, positive = "High")
# Print Logistic Regression Confusion Matrix
print("--- Logistic Regression (Baseline) ---")
## [1] "--- Logistic Regression (Baseline) ---"
print(cm_lr)
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low
## High 88 23
## Low 19 49
##
## Accuracy : 0.7654
## 95% CI : (0.6964, 0.8254)
## No Information Rate : 0.5978
## P-Value [Acc > NIR] : 1.683e-06
##
## Kappa : 0.5076
##
## Mcnemar's Test P-Value : 0.6434
##
## Sensitivity : 0.8224
## Specificity : 0.6806
## Pos Pred Value : 0.7928
## Neg Pred Value : 0.7206
## Prevalence : 0.5978
## Detection Rate : 0.4916
## Detection Prevalence : 0.6201
## Balanced Accuracy : 0.7515
##
## 'Positive' Class : High
##
# Print Random Forest Confusion Matrix
print("--- Random Forest (Comparison) ---")
## [1] "--- Random Forest (Comparison) ---"
print(cm_rf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low
## High 87 27
## Low 20 45
##
## Accuracy : 0.7374
## 95% CI : (0.6666, 0.8003)
## No Information Rate : 0.5978
## P-Value [Acc > NIR] : 6.447e-05
##
## Kappa : 0.4452
##
## Mcnemar's Test P-Value : 0.3815
##
## Sensitivity : 0.8131
## Specificity : 0.6250
## Pos Pred Value : 0.7632
## Neg Pred Value : 0.6923
## Prevalence : 0.5978
## Detection Rate : 0.4860
## Detection Prevalence : 0.6369
## Balanced Accuracy : 0.7190
##
## 'Positive' Class : High
##
# --- Create Comparison Table ---
stats_lr <- cm_lr$byClass
stats_rf <- cm_rf$byClass
results_table <- data.frame(
Metric = c("Accuracy", "Precision (High)", "Recall (High)", "F1-Score (High)"),
Logistic_Regression = c(
cm_lr$overall['Accuracy'],
stats_lr['Precision'],
stats_lr['Recall'],
stats_lr['F1']
),
Random_Forest = c(
cm_rf$overall['Accuracy'],
stats_rf['Precision'],
stats_rf['Recall'],
stats_rf['F1']
)
)
# Print the table using kable() for nice markdown formatting
kable(results_table, digits = 2, caption = "Model Performance Comparison")
| Metric | Logistic_Regression | Random_Forest | |
|---|---|---|---|
| Accuracy | Accuracy | 0.77 | 0.74 |
| Precision | Precision (High) | 0.79 | 0.76 |
| Recall | Recall (High) | 0.82 | 0.81 |
| F1 | F1-Score (High) | 0.81 | 0.79 |
# Load the gridExtra library to arrange plots side-by-side
# (You might need to install this: install.packages("gridExtra"))
library(gridExtra)
# Prepare data for Logistic Regression Plot ---
# Convert the table to a data.frame for ggplot
lr_table <- as.data.frame(cm_lr$table)
# Create Logistic Regression Plot (p_lr) ---
p_lr <- ggplot(lr_table, aes(x = Prediction, y = Reference, fill = Freq)) +
geom_tile(color = "black") + # Add black borders to tiles
geom_text(aes(label = Freq), vjust = 1.5, color = "black", size = 5) +
# Use a blue color scale, from light to dark
scale_fill_distiller(palette = "Blues", direction = 1) +
labs(
title = "Baseline (Logistic Regression) Confusion Matrix",
x = "Predicted Label",
y = "True Label"
) +
theme_minimal() +
theme(legend.position = "none") # Hide the fill legend
# Prepare data for Random Forest Plot ---
# Convert the table to a data.frame for ggplot
rf_table <- as.data.frame(cm_rf$table)
# Create Random Forest Plot (p_rf) ---
p_rf <- ggplot(rf_table, aes(x = Prediction, y = Reference, fill = Freq)) +
geom_tile(color = "black") +
geom_text(aes(label = Freq), vjust = 1.5, color = "black", size = 5) +
# Use a green color scale, from light to dark
scale_fill_distiller(palette = "Greens", direction = 1) +
labs(
title = "Comparison (Random Forest) Confusion Matrix",
x = "Predicted Label",
y = "True Label"
) +
theme_minimal() +
theme(legend.position = "none")
# Arrange plots side-by-side ---
combined_plots <- grid.arrange(p_lr, p_rf, ncol = 2)
# --- Save the combined plot to a file ---
# ggsave can save objects created by grid.arrange
ggsave(
"Confusion_Matrices_comparing_both_models.png",
plot = combined_plots,
width = 12,
height = 5,
dpi = 300
)
# Display the combined plot in the report
print(combined_plots)
## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]