Data Validation and Cleaning

This section details the inspection and cleaning steps performed on each column before analysis.

First, I load the data, inspect it, and then apply cleaning steps based on the validation findings.

# Load the dataset

df <- read_csv("recipe_site_traffic_2212.csv")
## Rows: 947 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): recipe, category, servings, high_traffic
## dbl (4): calories, carbohydrate, sugar, protein
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display the first 6 rows
print("Initial Data Head:")
## [1] "Initial Data Head:"
kable(head(df))
| recipe | calories | carbohydrate | sugar | protein | category      | servings | high_traffic |
|:-------|---------:|-------------:|------:|--------:|:--------------|:---------|:-------------|
| 001    |       NA |           NA |    NA |      NA | Pork          | 6        | High         |
| 002    |    35.48 |        38.56 |  0.66 |    0.92 | Potato        | 4        | High         |
| 003    |   914.28 |        42.68 |  3.09 |    2.88 | Breakfast     | 1        | NA           |
| 004    |    97.03 |        30.56 | 38.63 |    0.02 | Beverages     | 4        | High         |
| 005    |    27.05 |         1.85 |  0.80 |    0.53 | Beverages     | 4        | NA           |
| 006    |   691.15 |         3.46 |  1.65 |   53.93 | One Dish Meal | 2        | High         |
n_distinct(df$servings)
## [1] 6
unique(df$servings)
## [1] "6"            "4"            "1"            "2"            "4 as a snack"
## [6] "6 as a snack"
# Check data structure 
print("Data Structure (glimpse):")
## [1] "Data Structure (glimpse):"
glimpse(df)
## Rows: 947
## Columns: 8
## $ recipe       <chr> "001", "002", "003", "004", "005", "006", "007", "008", "…
## $ calories     <dbl> NA, 35.48, 914.28, 97.03, 27.05, 691.15, 183.94, 299.14, …
## $ carbohydrate <dbl> NA, 38.56, 42.68, 30.56, 1.85, 3.46, 47.95, 3.17, 3.78, 4…
## $ sugar        <dbl> NA, 0.66, 3.09, 38.63, 0.80, 1.65, 9.75, 0.40, 3.37, 3.99…
## $ protein      <dbl> NA, 0.92, 2.88, 0.02, 0.53, 53.93, 46.71, 32.40, 3.79, 11…
## $ category     <chr> "Pork", "Potato", "Breakfast", "Beverages", "Beverages", …
## $ servings     <chr> "6", "4", "1", "4", "4", "2", "4", "4", "6", "2", "1", "6…
## $ high_traffic <chr> "High", "High", NA, "High", NA, "High", NA, NA, "High", N…
# Check for missing values 
print("Initial Missing Value Counts:")
## [1] "Initial Missing Value Counts:"
sapply(df, function(x) sum(is.na(x)))
##       recipe     calories carbohydrate        sugar      protein     category 
##            0           52           52           52           52            0 
##     servings high_traffic 
##            0          373
# --- Data Cleaning Pipeline ---
df_clean <- df %>%
# Impute missing 'high_traffic' values with "Low" (NA is taken to mean the recipe was not flagged as high-traffic)
  mutate(high_traffic = replace_na(high_traffic, "Low")) %>%
  
#  Drop rows with NA in 'calories' (also drops NAs in carbohydrate, sugar, protein)
  drop_na(calories) %>%
  
#  Clean and convert 'servings' column
  mutate(
    servings = str_replace(servings, "as a snack", ""), # Remove "as a snack"
    servings = as.integer(str_trim(servings))            # Trim whitespace and convert to integer
  ) %>%
  
#  Ensure 'protein' is stored as numeric (defensive check; glimpse() already shows dbl)
  mutate(protein = as.numeric(protein))
# --- Validation Checks After Cleaning ---

# Check final NA counts
print("Missing Value Counts After Cleaning:")
## [1] "Missing Value Counts After Cleaning:"
sapply(df_clean, function(x) sum(is.na(x)))
##       recipe     calories carbohydrate        sugar      protein     category 
##            0            0            0            0            0            0 
##     servings high_traffic 
##            0            0
# Check 'servings' column values 
print("Cleaned 'servings' Counts:")
## [1] "Cleaned 'servings' Counts:"
print(table(df_clean$servings))
## 
##   1   2   4   6 
## 169 174 367 185
# Display summary statistics 
print("Summary Statistics After Cleaning:")
## [1] "Summary Statistics After Cleaning:"
summary(df_clean)
##     recipe             calories        carbohydrate         sugar        
##  Length:895         Min.   :   0.14   Min.   :  0.030   Min.   :  0.010  
##  Class :character   1st Qu.: 110.43   1st Qu.:  8.375   1st Qu.:  1.690  
##  Mode  :character   Median : 288.55   Median : 21.480   Median :  4.550  
##                     Mean   : 435.94   Mean   : 35.070   Mean   :  9.047  
##                     3rd Qu.: 597.65   3rd Qu.: 44.965   3rd Qu.:  9.800  
##                     Max.   :3633.16   Max.   :530.420   Max.   :148.750  
##     protein          category            servings     high_traffic      
##  Min.   :  0.000   Length:895         Min.   :1.000   Length:895        
##  1st Qu.:  3.195   Class :character   1st Qu.:2.000   Class :character  
##  Median : 10.800   Mode  :character   Median :4.000   Mode  :character  
##  Mean   : 24.149                      Mean   :3.458                     
##  3rd Qu.: 30.200                      3rd Qu.:4.000                     
##  Max.   :363.360                      Max.   :6.000

Exploratory Data Analysis (EDA)

This section details the findings and interpretations from visualizing the cleaned data.

# high_traffic (Target Variable) Pie Chart

# Calculate counts and percentages
traffic_counts <- df_clean %>%
  group_by(high_traffic) %>%
  summarise(count = n(), .groups = 'drop') %>%
  mutate(percentage = count / sum(count))

print(traffic_counts)
## # A tibble: 2 × 3
##   high_traffic count percentage
##   <chr>        <int>      <dbl>
## 1 High           535      0.598
## 2 Low            360      0.402
# Plot pie chart
ggplot(traffic_counts, aes(x = "", y = percentage, fill = high_traffic)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  # Add labels (percentage and count)
  geom_text(aes(label = paste0(scales::percent(percentage, accuracy = 0.01), "\n(", count, ")")), 
            position = position_stack(vjust = 0.5)) +
  labs(title = "Recipe Traffic Distribution", fill = "Traffic") +
  theme_void()

  • Finding: The cleaned dataset consists of 59.78% High-traffic recipes and 40.22% Low-traffic recipes.

  • Interpretation: The dataset is slightly imbalanced, with a majority of recipes driving high traffic.

# category Column Bar Chart
ggplot(df_clean, aes(y = category, fill = category)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count)), hjust = -0.1) +
  labs(title = "Number of Recipes per Category", x = "Count", y = "Category") +
  theme_minimal() +
  theme(legend.position = "none")

  • Finding: Breakfast is the most frequent category (106 recipes), while One Dish Meal is the least frequent (71 recipes); the quick tabulation below can verify these counts.
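
The exact counts behind the bar chart can be checked numerically; a minimal sketch using dplyr::count():

# Count recipes per category, most frequent first
df_clean %>%
  count(category, sort = TRUE) %>%
  kable()
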
# --- Filter data to only Chicken categories ---

chicken_categories <- c("Chicken", "Chicken Breast")

chicken_data <- df_clean %>%
  filter(category %in% chicken_categories)

# --- Calculate proportions ---

impact <- chicken_data %>%
  # Count combinations of category and high_traffic
  group_by(category, high_traffic) %>%
  summarise(count = n(), .groups = 'drop') %>%
  # Calculate proportion within each category
  group_by(category) %>%
  mutate(Proportion = count / sum(count))

# --- Print the impact table ---

impact_table_wide <- impact %>%
  select(-count) %>% # Drop the raw count
  pivot_wider(names_from = high_traffic, values_from = Proportion)

print("Proportion of High/Low Traffic for Chicken Categories:")
## [1] "Proportion of High/Low Traffic for Chicken Categories:"
kable(impact_table_wide, digits = 3)
| category       |  High |   Low |
|:---------------|------:|------:|
| Chicken        | 0.362 | 0.638 |
| Chicken Breast | 0.468 | 0.532 |
# --- Plot the proportions ---

chicken_plot <- ggplot(impact, aes(x = category, y = Proportion, fill = high_traffic)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(
    title = "High Traffic Proportion for Chicken Categories",
    y = "Proportion",
    x = "Category",
    fill = "High Traffic"
  ) +
  theme_minimal()

# --- Save the plot ---
ggsave("High_Traffic_Proportion_for_Chicken_Categories.png", plot = chicken_plot)

# --- Display the plot ---
print(chicken_plot)

# Nutritional Columns Boxplots


# We "pivot" the data to a long format for easy plotting with ggplot2
df_long <- df_clean %>%
  select(calories, carbohydrate, sugar, protein, high_traffic) %>%
  pivot_longer(
    cols = -high_traffic, 
    names_to = "nutrient", 
    values_to = "value"
  )
  
ggplot(df_long, aes(x = high_traffic, y = value, fill = high_traffic)) +
  geom_boxplot() +
  # Use facet_wrap to create separate plots for each nutrient
  facet_wrap(~ nutrient, scales = "free_y") +
  labs(title = "Nutritional Value Distribution by Traffic",
       x = "High Traffic", y = "Value") +
  theme_minimal()

  • Findings & Interpretation:

  • calories, carbohydrate, sugar: The high-traffic group had noticeably more extreme high values (outliers) in these nutrients.

  • protein: The distributions for both groups were almost identical.

  • Interpretation: Exceptionally high-calorie, high-carbohydrate, and especially high-sugar “indulgent” recipes appear to be strong drivers of high traffic, while protein content shows little apparent impact. Rough numbers behind this pattern are sketched below.
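
To attach rough numbers to these observations, the long-format data built above can be summarised per nutrient and traffic group; a minimal sketch reporting medians and 90th percentiles:

# Median and 90th percentile of each nutrient by traffic group
df_long %>%
  group_by(nutrient, high_traffic) %>%
  summarise(
    median = median(value),
    p90    = quantile(value, 0.9),
    .groups = "drop"
  ) %>%
  kable(digits = 1)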

# Correlation Analysis Heatmap

# Select only the numeric columns for correlation
numeric_df <- df_clean %>% select(calories, carbohydrate, sugar, protein, servings)

# Calculate correlation matrix
cor_matrix <- round(cor(numeric_df), 2)

# Melt the matrix for ggplot2
melted_cormat <- melt(cor_matrix)

# Plot the heatmap
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile() +
  # Add correlation values as text
  geom_text(aes(label = value), color = "black", size = 4) +
  # Define the color scale
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Correlation") +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                    size = 12, hjust = 1)) +
  coord_fixed() +
  labs(title = "Correlation Heatmap of Numerical Features")

  • Finding: All correlations were very low (close to 0).

  • Interpretation: This is a positive result: it suggests little multicollinearity among the numeric features, so each one contributes largely independent information to the model. An optional VIF check is sketched below.
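
As an optional complement to the heatmap, variance inflation factors can be computed from a simple logistic fit. A minimal sketch, assuming the car package is installed (not part of the original pipeline):

# Multicollinearity check via variance inflation factors (requires the 'car' package)
vif_fit <- glm(as.factor(high_traffic) ~ calories + carbohydrate + sugar + protein + servings,
               data = df_clean, family = binomial)
car::vif(vif_fit)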

# --- Calculate Averages ---
# (Equivalent to Python's .groupby().mean().reset_index())
traffic_avg <- df_clean %>%
  # Group by the two categorical variables
  group_by(category, high_traffic) %>%
  # Calculate the mean for all numeric columns of interest
  summarise(
    calories = mean(calories, na.rm = TRUE),
    carbohydrate = mean(carbohydrate, na.rm = TRUE),
    sugar = mean(sugar, na.rm = TRUE),
    protein = mean(protein, na.rm = TRUE),
    servings = mean(servings, na.rm = TRUE),
    .groups = 'drop' # Ungroup after summarising
  )

# --- Pivot to Long Format for Plotting ---
# This is the standard R/ggplot2 way to prepare data for faceted plots
traffic_avg_long <- traffic_avg %>%
  pivot_longer(
    cols = c(calories, carbohydrate, sugar, protein, servings),
    names_to = "nutrient", # The name of the new column for feature names
    values_to = "average"  # The name of the new column for the mean values
  )

# --- Create the Faceted Bar Plot ---
# (Equivalent to Python's loop over subplots)
ggplot(traffic_avg_long, aes(x = category, y = average, fill = high_traffic)) +
  # Create grouped bar chart
  geom_bar(stat = "identity", position = position_dodge()) +
  
  # Add value labels on top of bars, dodging them to match
  geom_text(
    aes(label = round(average, 1)), 
    position = position_dodge(width = 0.9), 
    vjust = -0.3, # Adjust vertical position to be just above the bar
    size = 3
  ) +
  
  # Create separate plots for each nutrient, stacked vertically (1 column)
  # scales = "free_y" allows each plot to have its own y-axis range
  facet_wrap(~ nutrient, ncol = 1, scales = "free_y") +
  
  labs(
    title = "Average Nutritional Values by Category & Traffic",
    x = "Category",
    y = "Average Value",
    fill = "Traffic Level"
  ) +
  
  # Rotate x-axis labels for readability
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Model Development and Evaluation

Feature Engineering & Preprocessing

This is a binary classification problem. Logistic regression is used as a baseline and a random forest as the comparison model.

  • I first prepare the data by dropping the recipe identifier and converting category and high_traffic to factors.
# --- Model Development ---

# Prepare data for modeling
model_df <- df_clean %>%
  # Drop the recipe identifier
  select(-recipe) %>%
  # Convert categorical variables to factors
  mutate(
    category = as.factor(category),
    high_traffic = as.factor(high_traffic)
  )

# Set the "High" level as the positive class for evaluation
# This ensures 'Precision' and 'Recall' are calculated for "High" traffic
model_df$high_traffic <- relevel(model_df$high_traffic, ref = "High")

# --- Train-Test Split ---
set.seed(42) # for reproducibility
trainIndex <- createDataPartition(model_df$high_traffic, p = 0.8, 
                                  list = FALSE, 
                                  times = 1)

train_data <- model_df[trainIndex, ]
test_data  <- model_df[-trainIndex, ]
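
A quick sanity check (a minimal sketch) that the stratified split keeps the class balance similar in both partitions:

# Class proportions should be roughly equal in train and test
prop.table(table(train_data$high_traffic))
prop.table(table(test_data$high_traffic))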

Fitting a Baseline Model

# --- Model Development & Evaluation ---

# Set up 10-fold cross-validation
# We use twoClassSummary to get ROC, Sensitivity, and Specificity
train_control <- trainControl(method = "cv", number = 10, 
                              summaryFunction = twoClassSummary,
                              classProbs = TRUE)

# Baseline Model: Logistic Regression (with scaling)
set.seed(42)
model_lr <- train(
  high_traffic ~ ., 
  data = train_data, 
  method = "glm", 
  family = "binomial",
  preProcess = c("center", "scale"), # Scales numeric features
  metric = "ROC",                    # Tune based on ROC AUC
  trControl = train_control
)

print(model_lr)
## Generalized Linear Model 
## 
## 716 samples
##   6 predictor
##   2 classes: 'High', 'Low' 
## 
## Pre-processing: centered (15), scaled (15) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 646, 644, 644, 644, 645, 645, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.8254922  0.7918605  0.7257389

Fitting a Comparison Model — Random Forest Classifier

# 2. Comparison Model: Random Forest
# (Equivalent to RandomForestClassifier)
# Note: RF doesn't require scaling, so we omit preProcess
set.seed(42)
model_rf <- train(
  high_traffic ~ ., 
  data = train_data, 
  method = "rf", 
  metric = "ROC",
  trControl = train_control,
  tuneLength = 3 # Checks 3 'mtry' values; similar to default RF
)

print(model_rf)
## Random Forest 
## 
## 716 samples
##   6 predictor
##   2 classes: 'High', 'Low' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 646, 644, 644, 644, 645, 645, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##    2    0.7904806  0.8082503  0.6597291
##    8    0.7905525  0.7849391  0.6075123
##   15    0.7875230  0.7964563  0.6142857
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 8.

Model Evaluation and Comparison

I evaluate both models on the unseen test data.

# --- Model Evaluation ---

# Get predictions for the test set
preds_lr <- predict(model_lr, newdata = test_data)
preds_rf <- predict(model_rf, newdata = test_data)

# Generate Confusion Matrices
cm_lr <- confusionMatrix(preds_lr, test_data$high_traffic, positive = "High")
cm_rf <- confusionMatrix(preds_rf, test_data$high_traffic, positive = "High")

# Print Logistic Regression Confusion Matrix
print("--- Logistic Regression (Baseline) ---")
## [1] "--- Logistic Regression (Baseline) ---"
print(cm_lr)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High Low
##       High   88  23
##       Low    19  49
##                                           
##                Accuracy : 0.7654          
##                  95% CI : (0.6964, 0.8254)
##     No Information Rate : 0.5978          
##     P-Value [Acc > NIR] : 1.683e-06       
##                                           
##                   Kappa : 0.5076          
##                                           
##  Mcnemar's Test P-Value : 0.6434          
##                                           
##             Sensitivity : 0.8224          
##             Specificity : 0.6806          
##          Pos Pred Value : 0.7928          
##          Neg Pred Value : 0.7206          
##              Prevalence : 0.5978          
##          Detection Rate : 0.4916          
##    Detection Prevalence : 0.6201          
##       Balanced Accuracy : 0.7515          
##                                           
##        'Positive' Class : High            
## 
# Print Random Forest Confusion Matrix
print("--- Random Forest (Comparison) ---")
## [1] "--- Random Forest (Comparison) ---"
print(cm_rf)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High Low
##       High   87  27
##       Low    20  45
##                                           
##                Accuracy : 0.7374          
##                  95% CI : (0.6666, 0.8003)
##     No Information Rate : 0.5978          
##     P-Value [Acc > NIR] : 6.447e-05       
##                                           
##                   Kappa : 0.4452          
##                                           
##  Mcnemar's Test P-Value : 0.3815          
##                                           
##             Sensitivity : 0.8131          
##             Specificity : 0.6250          
##          Pos Pred Value : 0.7632          
##          Neg Pred Value : 0.6923          
##              Prevalence : 0.5978          
##          Detection Rate : 0.4860          
##    Detection Prevalence : 0.6369          
##       Balanced Accuracy : 0.7190          
##                                           
##        'Positive' Class : High            
## 
# --- Create Comparison Table ---

stats_lr <- cm_lr$byClass
stats_rf <- cm_rf$byClass

results_table <- data.frame(
  Metric = c("Accuracy", "Precision (High)", "Recall (High)", "F1-Score (High)"),
  Logistic_Regression = c(
    cm_lr$overall['Accuracy'], 
    stats_lr['Precision'], 
    stats_lr['Recall'], 
    stats_lr['F1']
  ),
  Random_Forest = c(
    cm_rf$overall['Accuracy'], 
    stats_rf['Precision'], 
    stats_rf['Recall'], 
    stats_rf['F1']
  )
)

# Print the table using kable() for nice markdown formatting
kable(results_table, digits = 2, caption = "Model Performance Comparison")
Model Performance Comparison

|           | Metric           | Logistic_Regression | Random_Forest |
|:----------|:-----------------|--------------------:|--------------:|
| Accuracy  | Accuracy         |                0.77 |          0.74 |
| Precision | Precision (High) |                0.79 |          0.76 |
| Recall    | Recall (High)    |                0.82 |          0.81 |
| F1        | F1-Score (High)  |                0.81 |          0.79 |
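
Both models were tuned on ROC AUC during cross-validation, so the same metric can also be compared on the held-out test set. A minimal sketch, assuming the pROC package is installed (not run as part of this report):

library(pROC)

# Predicted probabilities for the "High" class on the test set
prob_lr <- predict(model_lr, newdata = test_data, type = "prob")[, "High"]
prob_rf <- predict(model_rf, newdata = test_data, type = "prob")[, "High"]

# Test-set ROC AUC for each model ("Low" = controls, "High" = cases)
auc(roc(test_data$high_traffic, prob_lr, levels = c("Low", "High")))
auc(roc(test_data$high_traffic, prob_rf, levels = c("Low", "High")))
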
# Load the gridExtra library to arrange plots side-by-side
# (You might need to install this: install.packages("gridExtra"))
library(gridExtra)

# --- Prepare data for Logistic Regression Plot ---
# Convert the table to a data.frame for ggplot
lr_table <- as.data.frame(cm_lr$table)

# --- Create Logistic Regression Plot (p_lr) ---
p_lr <- ggplot(lr_table, aes(x = Prediction, y = Reference, fill = Freq)) +
  geom_tile(color = "black") + # Add black borders to tiles
  geom_text(aes(label = Freq), vjust = 1.5, color = "black", size = 5) +
  # Use a blue color scale, from light to dark
  scale_fill_distiller(palette = "Blues", direction = 1) +
  labs(
    title = "Baseline (Logistic Regression) Confusion Matrix",
    x = "Predicted Label",
    y = "True Label"
  ) +
  theme_minimal() +
  theme(legend.position = "none") # Hide the fill legend

# --- Prepare data for Random Forest Plot ---
# Convert the table to a data.frame for ggplot
rf_table <- as.data.frame(cm_rf$table)

# --- Create Random Forest Plot (p_rf) ---
p_rf <- ggplot(rf_table, aes(x = Prediction, y = Reference, fill = Freq)) +
  geom_tile(color = "black") +
  geom_text(aes(label = Freq), vjust = 1.5, color = "black", size = 5) +
  # Use a green color scale, from light to dark
  scale_fill_distiller(palette = "Greens", direction = 1) +
  labs(
    title = "Comparison (Random Forest) Confusion Matrix",
    x = "Predicted Label",
    y = "True Label"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# --- Arrange plots side-by-side ---

combined_plots <- grid.arrange(p_lr, p_rf, ncol = 2)

# --- Save the combined plot to a file ---

# ggsave can save objects created by grid.arrange
ggsave(
  "Confusion_Matrices_comparing_both_models.png", 
  plot = combined_plots, 
  width = 12, 
  height = 5,
  dpi = 300
)

# Note: grid.arrange() already drew the combined plot above;
# printing the gtable object shows its layout structure
print(combined_plots)
## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]