library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the cookie-ingredient dataset from a CSV file in the user's Downloads
# folder. stringsAsFactors = TRUE turns every text column into a factor.
# NOTE(review): the path is machine-specific; adjust it before running elsewhere.
data <- read.csv("~/Downloads/choc_chip_cookie_ingredients.csv", stringsAsFactors = TRUE)
# Display the first few rows to check the column names and value formats.
head(data)
## X Ingredient Text
## 1 1 all purpose flour 3.0 cups all purpose flour
## 2 2 all purpose flour 2.8000000000000003 cups all purpose flour
## 3 3 all purpose flour 1.1076923076923078 cups all purpose flour
## 4 4 all purpose flour 3.333333333333333 cups sifted all purpose flour
## 5 5 all purpose flour 2.0 cups all purpose flour
## 6 6 all purpose flour 9.0 cups unbleached all purpose flour
## Recipe_Index Rating Quantity Unit
## 1 AR_1 0.9207246 3.000000 cup
## 2 AR_10 0.9051620 2.800000 cup
## 3 AR_101 0.6000000 1.107692 cup
## 4 AR_102 0.9375000 3.333333 cup
## 5 AR_103 0.8812500 2.000000 cup
## 6 AR_107 0.9272727 9.000000 cup
# Per-column overview of the whole dataset: quartiles and mean for numeric
# columns, level counts for factor columns.
summary(data)
## X Ingredient Text
## Min. : 1.0 egg :212 2.0 teaspoon baking soda : 29
## 1st Qu.: 498.2 vanilla :196 1.0 teaspoon baking soda : 26
## Median : 995.5 all purpose flour:193 1.0 teaspoon salt : 23
## Mean : 995.5 baking soda :187 4.0 eggs : 19
## 3rd Qu.:1492.8 sugar :175 2.0 eggs : 16
## Max. :1990.0 light brown sugar:170 2.0 teaspoon vanilla extract: 16
## (Other) :857 (Other) :1861
## Recipe_Index Rating Quantity Unit
## AR_101 : 16 Min. :0.3750 Min. : 0.000 cup :1111
## AR_4 : 15 1st Qu.:0.7500 1st Qu.: 1.000 egg : 212
## AR_39 : 14 Median :0.8703 Median : 1.640 ounce : 9
## AR_96 : 14 Mean :0.8150 Mean : 2.234 tablespoon: 23
## AR_110 : 13 3rd Qu.:0.9090 3rd Qu.: 2.667 teaspoon : 635
## AR_14 : 13 Max. :1.0000 Max. :48.000
## (Other):1905 NA's :1010
# Dimensions of the dataset as (rows, columns).
dim(data)
## [1] 1990 7
# Summary statistics, part 1: numeric columns ----
# 'Rating' and 'Quantity' are treated as numeric; 'Ingredient' and 'Unit' are
# handled separately as categorical columns.
#
# One row holding seven statistics per column (min, max, mean, median,
# standard deviation, first and third quartile). Missing values are dropped
# inside every statistic. Output columns are named <column>_<statistic>,
# with all Rating columns first and all Quantity columns after.
numeric_summary <- data %>%
  summarise(
    across(
      c(Rating, Quantity),
      list(
        Min = function(x) min(x, na.rm = TRUE),
        Max = function(x) max(x, na.rm = TRUE),
        Mean = function(x) mean(x, na.rm = TRUE),
        Median = function(x) median(x, na.rm = TRUE),
        SD = function(x) sd(x, na.rm = TRUE),
        Q1 = function(x) quantile(x, 0.25, na.rm = TRUE),
        Q3 = function(x) quantile(x, 0.75, na.rm = TRUE)
      ),
      .names = "{.col}_{.fn}"
    )
  )
print("Numeric Summary:")
## [1] "Numeric Summary:"
print(numeric_summary)
## Rating_Min Rating_Max Rating_Mean Rating_Median Rating_SD Rating_Q1 Rating_Q3
## 1 0.375 1 0.8149856 0.8702532 0.1356323 0.75 0.9090164
## Quantity_Min Quantity_Max Quantity_Mean Quantity_Median Quantity_SD
## 1 0 48 2.234415 1.639632 2.529849
## Quantity_Q1 Quantity_Q3
## 1 1 2.666667
# Summary statistics, part 2: categorical columns ----
# One row with four list-columns: the distinct values and the frequency table
# of 'Ingredient', then the same pair for 'Unit'.
# NOTE(review): both columns are factors, so when this data frame is printed
# the distinct values show up as integer level codes and the counts appear
# without their labels.
cat_summary <- summarise(
  data,
  Ingredient_Values = list(unique(Ingredient)),
  Ingredient_Counts = list(table(Ingredient)),
  Unit_Values = list(unique(Unit)),
  Unit_Counts = list(table(Unit))
)
print("Categorical Summary:")
## [1] "Categorical Summary:"
print(cat_summary)
## Ingredient_Values
## 1 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 14, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 66, 67, 68, 33, 24, 7, 52, 25, 40, 65
## Ingredient_Counts
## 1 193, 4, 2, 1, 50, 187, 17, 1, 7, 1, 160, 6, 3, 1, 5, 2, 1, 1, 1, 2, 7, 1, 3, 19, 9, 212, 1, 1, 1, 2, 1, 1, 170, 2, 1, 2, 11, 1, 15, 31, 1, 1, 2, 9, 19, 2, 1, 11, 6, 1, 168, 159, 29, 2, 175, 3, 2, 196, 5, 3, 39, 4, 1, 3, 9, 1, 1, 1
## Unit_Values Unit_Counts
## 1 1, 5, 4, 3, 2 1111, 212, 9, 23, 635
# Bundle the numeric and categorical summaries into one named list so they can
# be printed (or passed around) together.
combined_summary <- setNames(
  list(numeric_summary, cat_summary),
  c("Numeric", "Categorical")
)
print("Combined Summary:")
## [1] "Combined Summary:"
print(combined_summary)
## $Numeric
## Rating_Min Rating_Max Rating_Mean Rating_Median Rating_SD Rating_Q1 Rating_Q3
## 1 0.375 1 0.8149856 0.8702532 0.1356323 0.75 0.9090164
## Quantity_Min Quantity_Max Quantity_Mean Quantity_Median Quantity_SD
## 1 0 48 2.234415 1.639632 2.529849
## Quantity_Q1 Quantity_Q3
## 1 1 2.666667
##
## $Categorical
## Ingredient_Values
## 1 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 14, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 66, 67, 68, 33, 24, 7, 52, 25, 40, 65
## Ingredient_Counts
## 1 193, 4, 2, 1, 50, 187, 17, 1, 7, 1, 160, 6, 3, 1, 5, 2, 1, 1, 1, 2, 7, 1, 3, 19, 9, 212, 1, 1, 1, 2, 1, 1, 170, 2, 1, 2, 11, 1, 15, 31, 1, 1, 2, 9, 19, 2, 1, 11, 6, 1, 168, 159, 29, 2, 175, 3, 2, 196, 5, 3, 39, 4, 1, 3, 9, 1, 1, 1
## Unit_Values Unit_Counts
## 1 1, 5, 4, 3, 2 1111, 212, 9, 23, 635
# Aggregation: average Rating within each Ingredient, ignoring missing ratings.
# The result has one row per ingredient.
agg_rating_by_ingredient <- summarise(
  group_by(data, Ingredient),
  Mean_Rating = mean(Rating, na.rm = TRUE)
)
# Show the per-ingredient means
print("Mean Rating by Ingredient:")
## [1] "Mean Rating by Ingredient:"
print(agg_rating_by_ingredient)
## # A tibble: 68 × 2
## Ingredient Mean_Rating
## <fct> <dbl>
## 1 all purpose flour 0.816
## 2 almond extract 0.829
## 3 almonds 0.891
## 4 applesauce 0.724
## 5 baking powder 0.804
## 6 baking soda 0.825
## 7 bittersweet chocolate chip 0.75
## 8 bourbon 0.933
## 9 bread flour 0.5
## 10 brown rice flour 0.875
## # ℹ 58 more rows
# Visualization 1: histogram of Rating in bins of width 0.1.
# Rows with a missing Rating are dropped by ggplot2 with a warning.
ggplot(data = data, mapping = aes(x = Rating)) +
  geom_histogram(binwidth = 0.1, colour = "black", fill = "blue") +
  ggtitle("Distribution of Ratings") +
  xlab("Rating") +
  ylab("Frequency")
## Warning: Removed 1010 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Visualization 2: histogram of Quantity in bins of width 0.5.
ggplot(data = data, mapping = aes(x = Quantity)) +
  geom_histogram(binwidth = 0.5, colour = "black", fill = "green") +
  ggtitle("Distribution of Quantity") +
  xlab("Quantity") +
  ylab("Frequency")
# Visualization 3: Quantity (x) against Rating (y), one point per row,
# with the point colour taken from Ingredient.
data %>%
  ggplot(mapping = aes(x = Quantity, y = Rating, colour = Ingredient)) +
  geom_point(size = 3) +
  theme_minimal() +
  labs(
    title = "Scatter Plot of Quantity vs. Rating by Ingredient",
    x = "Quantity",
    y = "Rating"
  )
## Warning: Removed 1010 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Visualization 4: one box per Ingredient showing the spread of Rating.
# The x-axis labels are rotated 45 degrees so the ingredient names do not
# overlap; the rotation is applied after theme_minimal() so it is not reset.
data %>%
  ggplot(mapping = aes(x = Ingredient, y = Rating, fill = Ingredient)) +
  geom_boxplot() +
  ggtitle("Box Plot of Ratings by Ingredient") +
  xlab("Ingredient") +
  ylab("Rating") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 1010 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Visualization 5: Quantity against Rating with a fitted straight line.
# Points are drawn first, then a linear-model trend line without a
# confidence band.
ggplot(data = data, mapping = aes(x = Quantity, y = Rating)) +
  geom_point(colour = "purple") +
  geom_smooth(method = "lm", se = FALSE, colour = "red") +
  theme_minimal() +
  labs(
    title = "Correlation between Quantity and Rating",
    x = "Quantity",
    y = "Rating"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1010 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1010 rows containing missing values or values outside the scale range
## (`geom_point()`).
# The data set is taken from the Pudding data.
# It is about the list of ingredients used for chocolate chip cookies.
# It represents the baking instructions from 211 chocolate chip cookie
# recipes: specifically, the ingredient lists from those 211 recipes, all
# scaled to yield 48 servings and with the units standardized.
# Where a rating was available on the recipe, it is provided as a numeric
# value. All ratings are normalized to be between 0 and 1.
# Quantity is the quantity of the ingredient, scaled to 48 servings.