# Load the dataset
df <- read_csv("recipe_site_traffic_2212.csv")
## Rows: 947 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): recipe, category, servings, high_traffic
## dbl (4): calories, carbohydrate, sugar, protein
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display the first 6 rows
print("Initial Data Head:")
## [1] "Initial Data Head:"
kable(head(df))
| recipe | calories | carbohydrate | sugar | protein | category | servings | high_traffic |
|---|---|---|---|---|---|---|---|
| 001 | NA | NA | NA | NA | Pork | 6 | High |
| 002 | 35.48 | 38.56 | 0.66 | 0.92 | Potato | 4 | High |
| 003 | 914.28 | 42.68 | 3.09 | 2.88 | Breakfast | 1 | NA |
| 004 | 97.03 | 30.56 | 38.63 | 0.02 | Beverages | 4 | High |
| 005 | 27.05 | 1.85 | 0.80 | 0.53 | Beverages | 4 | NA |
| 006 | 691.15 | 3.46 | 1.65 | 53.93 | One Dish Meal | 2 | High |
n_distinct(df$servings)
## [1] 6
unique(df$servings)
## [1] "6" "4" "1" "2" "4 as a snack"
## [6] "6 as a snack"
# Check data structure
print("Data Structure (glimpse):")
## [1] "Data Structure (glimpse):"
glimpse(df)
## Rows: 947
## Columns: 8
## $ recipe <chr> "001", "002", "003", "004", "005", "006", "007", "008", "…
## $ calories <dbl> NA, 35.48, 914.28, 97.03, 27.05, 691.15, 183.94, 299.14, …
## $ carbohydrate <dbl> NA, 38.56, 42.68, 30.56, 1.85, 3.46, 47.95, 3.17, 3.78, 4…
## $ sugar <dbl> NA, 0.66, 3.09, 38.63, 0.80, 1.65, 9.75, 0.40, 3.37, 3.99…
## $ protein <dbl> NA, 0.92, 2.88, 0.02, 0.53, 53.93, 46.71, 32.40, 3.79, 11…
## $ category <chr> "Pork", "Potato", "Breakfast", "Beverages", "Beverages", …
## $ servings <chr> "6", "4", "1", "4", "4", "2", "4", "4", "6", "2", "1", "6…
## $ high_traffic <chr> "High", "High", NA, "High", NA, "High", NA, NA, "High", N…
# Check for missing values
print("Initial Missing Value Counts:")
## [1] "Initial Missing Value Counts:"
sapply(df, function(x) sum(is.na(x)))
## recipe calories carbohydrate sugar protein category
## 0 52 52 52 52 0
## servings high_traffic
## 0 373
# --- Data Cleaning Pipeline ---
df_clean <- df %>%
# Impute missing 'high_traffic' values with "Low"
mutate(high_traffic = replace_na(high_traffic, "Low")) %>%
# Drop rows with NA in 'calories' (also drops NAs in carbohydrate, sugar, protein)
drop_na(calories) %>%
# Clean and convert 'servings' column
mutate(
servings = str_replace(servings, "as a snack", ""), # Remove "as a snack"
servings = as.integer(str_trim(servings)) # Trim whitespace and convert to integer
) %>%
# Ensure 'protein' is stored as numeric (defensive check; read_csv already parsed it as dbl)
mutate(protein = as.numeric(protein))
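A small sanity check (not part of the original pipeline; a minimal sketch using the cleaned columns above) can assert the cleaning invariants before moving on:
# Stop early if any cleaning step silently failed
stopifnot(
  !anyNA(df_clean$calories),                     # nutrition NAs were dropped
  !anyNA(df_clean$high_traffic),                 # target NAs were imputed as "Low"
  is.integer(df_clean$servings),                 # servings was converted to integer
  all(df_clean$servings %in% c(1L, 2L, 4L, 6L))  # only the expected serving sizes remain
)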
# --- Validation Checks After Cleaning ---
# Check final NA counts
print("Missing Value Counts After Cleaning:")
## [1] "Missing Value Counts After Cleaning:"
sapply(df_clean, function(x) sum(is.na(x)))
## recipe calories carbohydrate sugar protein category
## 0 0 0 0 0 0
## servings high_traffic
## 0 0
# Check 'servings' column values
print("Cleaned 'servings' Counts:")
## [1] "Cleaned 'servings' Counts:"
print(table(df_clean$servings))
##
## 1 2 4 6
## 169 174 367 185
# Display summary statistics
print("Summary Statistics After Cleaning:")
## [1] "Summary Statistics After Cleaning:"
summary(df_clean)
## recipe calories carbohydrate sugar
## Length:895 Min. : 0.14 Min. : 0.030 Min. : 0.010
## Class :character 1st Qu.: 110.43 1st Qu.: 8.375 1st Qu.: 1.690
## Mode :character Median : 288.55 Median : 21.480 Median : 4.550
## Mean : 435.94 Mean : 35.070 Mean : 9.047
## 3rd Qu.: 597.65 3rd Qu.: 44.965 3rd Qu.: 9.800
## Max. :3633.16 Max. :530.420 Max. :148.750
## protein category servings high_traffic
## Min. : 0.000 Length:895 Min. :1.000 Length:895
## 1st Qu.: 3.195 Class :character 1st Qu.:2.000 Class :character
## Median : 10.800 Mode :character Median :4.000 Mode :character
## Mean : 24.149 Mean :3.458
## 3rd Qu.: 30.200 3rd Qu.:4.000
## Max. :363.360 Max. :6.000
# high_traffic (Target Variable) Pie Chart
# Calculate counts and percentages
traffic_counts <- df_clean %>%
group_by(high_traffic) %>%
summarise(count = n(), .groups = 'drop') %>%
mutate(percentage = count / sum(count))
print(traffic_counts)
## # A tibble: 2 × 3
## high_traffic count percentage
## <chr> <int> <dbl>
## 1 High 535 0.598
## 2 Low 360 0.402
# Plot pie chart
ggplot(traffic_counts, aes(x = "", y = percentage, fill = high_traffic)) +
geom_bar(stat = "identity", width = 1) +
coord_polar("y", start = 0) +
# Add labels (percentage and count)
geom_text(aes(label = paste0(scales::percent(percentage, accuracy = 0.01), "\n(", count, ")")),
position = position_stack(vjust = 0.5)) +
labs(title = "Recipe Traffic Distribution", fill = "Traffic") +
theme_void()
Finding: The cleaned dataset consists of 59.78% High-traffic recipes and 40.22% Low-traffic recipes.
Interpretation: The dataset is mildly imbalanced, with the majority of recipes driving high traffic.
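As a quick, optional check (a minimal sketch, not part of the original analysis), the majority-class baseline accuracy puts the later model results in context; it should match the No Information Rate reported by confusionMatrix() further below:
# Accuracy from always predicting the majority class ("High")
majority_baseline <- max(table(df_clean$high_traffic)) / nrow(df_clean)
majority_baseline  # 535 / 895, roughly 0.598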
# category Column Bar Chart
ggplot(df_clean, aes(y = category, fill = category)) +
geom_bar() +
geom_text(stat = 'count', aes(label = after_stat(count)), hjust = -0.1) +
labs(title = "Number of Recipes per Category", x = "Count", y = "Category") +
theme_minimal() +
theme(legend.position = "none")
# --- Filter data to only Chicken categories ---
chicken_categories <- c("Chicken", "Chicken Breast")
chicken_data <- df_clean %>%
filter(category %in% chicken_categories)
# --- Calculate proportions ---
impact <- chicken_data %>%
# Count combinations of category and high_traffic
group_by(category, high_traffic) %>%
summarise(count = n(), .groups = 'drop') %>%
# Calculate proportion within each category
group_by(category) %>%
mutate(Proportion = count / sum(count))
# --- Print the impact table ---
impact_table_wide <- impact %>%
select(-count) %>% # Drop the raw count
pivot_wider(names_from = high_traffic, values_from = Proportion)
print("Proportion of High/Low Traffic for Chicken Categories:")
## [1] "Proportion of High/Low Traffic for Chicken Categories:"
kable(impact_table_wide, digits = 3)
| category | High | Low |
|---|---|---|
| Chicken | 0.362 | 0.638 |
| Chicken Breast | 0.468 | 0.532 |
# --- Plot the proportions ---
chicken_plot <- ggplot(impact, aes(x = category, y = Proportion, fill = high_traffic)) +
geom_bar(stat = "identity", position = position_dodge()) +
labs(
title = "High Traffic Proportion for Chicken Categories",
y = "Proportion",
x = "Category",
fill = "High Traffic"
) +
theme_minimal()
# --- Save the plot ---
ggsave("High_Traffic_Proportion_for_Chicken_Categories.png", plot = chicken_plot)
# --- Display the plot ---
print(chicken_plot)
# Nutritional Columns Boxplots
# We "pivot" the data to a long format for easy plotting with ggplot2
df_long <- df_clean %>%
select(calories, carbohydrate, sugar, protein, high_traffic) %>%
pivot_longer(
cols = -high_traffic,
names_to = "nutrient",
values_to = "value"
)
ggplot(df_long, aes(x = high_traffic, y = value, fill = high_traffic)) +
geom_boxplot() +
# Use facet_wrap to create separate plots for each nutrient
facet_wrap(~ nutrient, scales = "free_y") +
labs(title = "Nutritional Value Distribution by Traffic",
x = "High Traffic", y = "Value") +
theme_minimal()
Findings & Interpretation:
calories, carbohydrate, sugar: The high-traffic group had noticeably more high-value outliers.
protein: The distributions for both groups were almost identical.
Interpretation: Exceptionally high-calorie, high-carbohydrate, and especially high-sugar “indulgent” recipes appear to be meaningful drivers of high traffic, while protein content on its own shows little relationship with traffic.
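To make the outlier claim concrete, a short sketch (reusing the df_long frame built above) counts the share of values beyond the conventional 1.5 × IQR upper fence within each nutrient and traffic group:
# Share of observations above the 1.5 * IQR upper fence, per nutrient and traffic group
df_long %>%
  group_by(nutrient, high_traffic) %>%
  summarise(
    outlier_share = mean(value > quantile(value, 0.75) + 1.5 * IQR(value)),
    .groups = "drop"
  )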
# Correlation Analysis Heatmap
# Select only the numeric columns for correlation
numeric_df <- df_clean %>% select(calories, carbohydrate, sugar, protein, servings)
# Calculate correlation matrix
cor_matrix <- round(cor(numeric_df), 2)
# Melt the correlation matrix into long format for ggplot2 (melt() comes from the reshape2 package)
melted_cormat <- melt(cor_matrix)
# Plot the heatmap
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) +
geom_tile() +
# Add correlation values as text
geom_text(aes(label = value), color = "black", size = 4) +
# Define the color scale
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1)) +
coord_fixed() +
labs(title = "Correlation Heatmap of Numerical Features")
Finding: All pairwise correlations were very low (close to 0).
Interpretation: This is a positive result. It indicates little to no multicollinearity, so each feature contributes largely independent information to the model.
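As a lightweight confirmation (a one-line sketch using the cor_matrix computed above), the largest absolute off-diagonal correlation can be extracted directly; values well below the common 0.7 rule of thumb support this interpretation:
# Largest absolute pairwise correlation among the numeric features
max(abs(cor_matrix[upper.tri(cor_matrix)]))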
# --- Calculate Averages ---
# (Equivalent to Python's .groupby().mean().reset_index())
traffic_avg <- df_clean %>%
# Group by the two categorical variables
group_by(category, high_traffic) %>%
# Calculate the mean for all numeric columns of interest
summarise(
calories = mean(calories, na.rm = TRUE),
carbohydrate = mean(carbohydrate, na.rm = TRUE),
sugar = mean(sugar, na.rm = TRUE),
protein = mean(protein, na.rm = TRUE),
servings = mean(servings, na.rm = TRUE),
.groups = 'drop' # Ungroup after summarising
)
# --- Pivot to Long Format for Plotting ---
# This is the standard R/ggplot2 way to prepare data for faceted plots
traffic_avg_long <- traffic_avg %>%
pivot_longer(
cols = c(calories, carbohydrate, sugar, protein, servings),
names_to = "nutrient", # The name of the new column for feature names
values_to = "average" # The name of the new column for the mean values
)
# --- Create the Faceted Bar Plot ---
# (Equivalent to Python's loop over subplots)
ggplot(traffic_avg_long, aes(x = category, y = average, fill = high_traffic)) +
# Create grouped bar chart
geom_bar(stat = "identity", position = position_dodge()) +
# Add value labels on top of bars, dodging them to match
geom_text(
aes(label = round(average, 1)),
position = position_dodge(width = 0.9),
vjust = -0.3, # Adjust vertical position to be just above the bar
size = 3
) +
# Create separate plots for each nutrient, stacked vertically (1 column)
# scales = "free_y" allows each plot to have its own y-axis range
facet_wrap(~ nutrient, ncol = 1, scales = "free_y") +
labs(
title = "Average Nutritional Values by Category & Traffic",
x = "Category",
y = "Average Value",
fill = "Traffic Level"
) +
# Rotate x-axis labels for readability
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Convert category and high_traffic to factors for modeling.
# --- Model Development ---
# Prepare data for modeling
model_df <- df_clean %>%
# Drop the recipe identifier
select(-recipe) %>%
# Convert categorical variables to factors
mutate(
category = as.factor(category),
high_traffic = as.factor(high_traffic)
)
# Set the "High" level as the positive class for evaluation
# This ensures 'Precision' and 'Recall' are calculated for "High" traffic
model_df$high_traffic <- relevel(model_df$high_traffic, ref = "High")
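Because caret's twoClassSummary treats the first factor level as the event of interest, it is worth confirming the level order after relevel() (a quick optional check, not in the original code):
# "High" should now be the first level
levels(model_df$high_traffic)  # expected: "High" "Low"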
# --- Train-Test Split ---
set.seed(42) # for reproducibility
trainIndex <- createDataPartition(model_df$high_traffic, p = 0.8,
list = FALSE,
times = 1)
train_data <- model_df[trainIndex, ]
test_data <- model_df[-trainIndex, ]
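createDataPartition() samples within each outcome class, so the High/Low balance should be nearly identical in the two splits; a quick optional check makes that explicit:
# Class proportions in the training and test sets (both should be roughly 0.60 / 0.40)
prop.table(table(train_data$high_traffic))
prop.table(table(test_data$high_traffic))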
# --- Model Development & Evaluation ---
# Set up 10-fold cross-validation
# We use twoClassSummary to get ROC, Sensitivity, and Specificity
train_control <- trainControl(method = "cv", number = 10,
summaryFunction = twoClassSummary,
classProbs = TRUE)
# Baseline Model: Logistic Regression (with scaling)
set.seed(42)
model_lr <- train(
high_traffic ~ .,
data = train_data,
method = "glm",
family = "binomial",
preProcess = c("center", "scale"), # Scales numeric features
metric = "ROC", # Tune based on ROC AUC
trControl = train_control
)
print(model_lr)
## Generalized Linear Model
##
## 716 samples
## 6 predictor
## 2 classes: 'High', 'Low'
##
## Pre-processing: centered (15), scaled (15)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 646, 644, 644, 644, 645, 645, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8254922 0.7918605 0.7257389
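For interpretation, the underlying glm fit can be inspected directly (a sketch; note that the coefficients are on the centered-and-scaled feature scale because of the preProcess step):
# Coefficient summary of the fitted logistic regression
summary(model_lr$finalModel)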
# 2. Comparison Model: Random Forest
# (Equivalent to RandomForestClassifier)
# Note: RF doesn't require scaling, so we omit preProcess
set.seed(42)
model_rf <- train(
high_traffic ~ .,
data = train_data,
method = "rf",
metric = "ROC",
trControl = train_control,
tuneLength = 3 # Checks 3 'mtry' values; similar to default RF
)
print(model_rf)
## Random Forest
##
## 716 samples
## 6 predictor
## 2 classes: 'High', 'Low'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 646, 644, 644, 644, 645, 645, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.7904806 0.8082503 0.6597291
## 8 0.7905525 0.7849391 0.6075123
## 15 0.7875230 0.7964563 0.6142857
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 8.
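As a brief follow-up (not part of the original workflow), caret's varImp() shows which predictors the tuned random forest relies on most:
# Variable importance for the tuned random forest
rf_importance <- varImp(model_rf)
print(rf_importance)
plot(rf_importance, top = 10)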
# --- Model Evaluation ---
# Get predictions for the test set
preds_lr <- predict(model_lr, newdata = test_data)
preds_rf <- predict(model_rf, newdata = test_data)
# Generate Confusion Matrices
cm_lr <- confusionMatrix(preds_lr, test_data$high_traffic, positive = "High")
cm_rf <- confusionMatrix(preds_rf, test_data$high_traffic, positive = "High")
# Print Logistic Regression Confusion Matrix
print("--- Logistic Regression (Baseline) ---")
## [1] "--- Logistic Regression (Baseline) ---"
print(cm_lr)
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low
## High 88 23
## Low 19 49
##
## Accuracy : 0.7654
## 95% CI : (0.6964, 0.8254)
## No Information Rate : 0.5978
## P-Value [Acc > NIR] : 1.683e-06
##
## Kappa : 0.5076
##
## Mcnemar's Test P-Value : 0.6434
##
## Sensitivity : 0.8224
## Specificity : 0.6806
## Pos Pred Value : 0.7928
## Neg Pred Value : 0.7206
## Prevalence : 0.5978
## Detection Rate : 0.4916
## Detection Prevalence : 0.6201
## Balanced Accuracy : 0.7515
##
## 'Positive' Class : High
##
# Print Random Forest Confusion Matrix
print("--- Random Forest (Comparison) ---")
## [1] "--- Random Forest (Comparison) ---"
print(cm_rf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low
## High 87 27
## Low 20 45
##
## Accuracy : 0.7374
## 95% CI : (0.6666, 0.8003)
## No Information Rate : 0.5978
## P-Value [Acc > NIR] : 6.447e-05
##
## Kappa : 0.4452
##
## Mcnemar's Test P-Value : 0.3815
##
## Sensitivity : 0.8131
## Specificity : 0.6250
## Pos Pred Value : 0.7632
## Neg Pred Value : 0.6923
## Prevalence : 0.5978
## Detection Rate : 0.4860
## Detection Prevalence : 0.6369
## Balanced Accuracy : 0.7190
##
## 'Positive' Class : High
##
# --- Create Comparison Table ---
stats_lr <- cm_lr$byClass
stats_rf <- cm_rf$byClass
results_table <- data.frame(
Metric = c("Accuracy", "Precision (High)", "Recall (High)", "F1-Score (High)"),
Logistic_Regression = c(
cm_lr$overall['Accuracy'],
stats_lr['Precision'],
stats_lr['Recall'],
stats_lr['F1']
),
Random_Forest = c(
cm_rf$overall['Accuracy'],
stats_rf['Precision'],
stats_rf['Recall'],
stats_rf['F1']
)
)
# Print the table using kable() for nice markdown formatting
kable(results_table, digits = 2, row.names = FALSE, caption = "Model Performance Comparison")
| Metric | Logistic_Regression | Random_Forest |
|---|---|---|
| Accuracy | 0.77 | 0.74 |
| Precision (High) | 0.79 | 0.76 |
| Recall (High) | 0.82 | 0.81 |
| F1-Score (High) | 0.81 | 0.79 |
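Because both models were tuned on ROC during cross-validation, a test-set AUC comparison is a natural addition (a sketch assuming the pROC package, which is not used elsewhere in this report):
# install.packages("pROC") if needed
library(pROC)
# Predicted probabilities for the "High" class on the test set
probs_lr <- predict(model_lr, newdata = test_data, type = "prob")[, "High"]
probs_rf <- predict(model_rf, newdata = test_data, type = "prob")[, "High"]
# ROC curves and AUC (levels = controls first, cases second)
roc_lr <- roc(response = test_data$high_traffic, predictor = probs_lr, levels = c("Low", "High"))
roc_rf <- roc(response = test_data$high_traffic, predictor = probs_rf, levels = c("Low", "High"))
auc(roc_lr)
auc(roc_rf)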
# Load the gridExtra library to arrange plots side-by-side
# (You might need to install this: install.packages("gridExtra"))
library(gridExtra)
# Prepare data for Logistic Regression Plot ---
# Convert the table to a data.frame for ggplot
lr_table <- as.data.frame(cm_lr$table)
# Create Logistic Regression Plot (p_lr) ---
p_lr <- ggplot(lr_table, aes(x = Prediction, y = Reference, fill = Freq)) +
geom_tile(color = "black") + # Add black borders to tiles
geom_text(aes(label = Freq), vjust = 1.5, color = "black", size = 5) +
# Use a blue color scale, from light to dark
scale_fill_distiller(palette = "Blues", direction = 1) +
labs(
title = "Baseline (Logistic Regression) Confusion Matrix",
x = "Predicted Label",
y = "True Label"
) +
theme_minimal() +
theme(legend.position = "none") # Hide the fill legend
# Prepare data for Random Forest Plot ---
# Convert the table to a data.frame for ggplot
rf_table <- as.data.frame(cm_rf$table)
# Create Random Forest Plot (p_rf) ---
p_rf <- ggplot(rf_table, aes(x = Prediction, y = Reference, fill = Freq)) +
geom_tile(color = "black") +
geom_text(aes(label = Freq), vjust = 1.5, color = "black", size = 5) +
# Use a green color scale, from light to dark
scale_fill_distiller(palette = "Greens", direction = 1) +
labs(
title = "Comparison (Random Forest) Confusion Matrix",
x = "Predicted Label",
y = "True Label"
) +
theme_minimal() +
theme(legend.position = "none")
# Arrange plots side-by-side ---
combined_plots <- grid.arrange(p_lr, p_rf, ncol = 2)
# --- Save the combined plot to a file ---
# ggsave can save objects created by grid.arrange
ggsave(
"Confusion_Matrices_comparing_both_models.png",
plot = combined_plots,
width = 12,
height = 5,
dpi = 300
)
# Display the combined plot in the report
# (grid.arrange() above already draws it; grid.draw() re-renders the saved gtable)
grid::grid.draw(combined_plots)