Assignment2

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Read the cookie-ingredient CSV from the user's Downloads folder.
# stringsAsFactors = TRUE converts the text columns to factors.
data <- read.csv(
  file = "~/Downloads/choc_chip_cookie_ingredients.csv",
  stringsAsFactors = TRUE
)

# Show the first six rows to check the column layout
head(data, n = 6L)

##   X        Ingredient                                            Text
## 1 1 all purpose flour                      3.0 cups all purpose flour
## 2 2 all purpose flour       2.8000000000000003 cups all purpose flour
## 3 3 all purpose flour       1.1076923076923078 cups all purpose flour
## 4 4 all purpose flour 3.333333333333333 cups sifted all purpose flour
## 5 5 all purpose flour                      2.0 cups all purpose flour
## 6 6 all purpose flour           9.0 cups unbleached all purpose flour
##   Recipe_Index    Rating Quantity Unit
## 1         AR_1 0.9207246 3.000000  cup
## 2        AR_10 0.9051620 2.800000  cup
## 3       AR_101 0.6000000 1.107692  cup
## 4       AR_102 0.9375000 3.333333  cup
## 5       AR_103 0.8812500 2.000000  cup
## 6       AR_107 0.9272727 9.000000  cup

# Per-column overview: quartiles for numeric columns, level counts for factors
data %>%
  summary()

##        X                      Ingredient                            Text     
##  Min.   :   1.0   egg              :212   2.0 teaspoon baking soda    :  29  
##  1st Qu.: 498.2   vanilla          :196   1.0 teaspoon baking soda    :  26  
##  Median : 995.5   all purpose flour:193   1.0 teaspoon salt           :  23  
##  Mean   : 995.5   baking soda      :187   4.0 eggs                    :  19  
##  3rd Qu.:1492.8   sugar            :175   2.0 eggs                    :  16  
##  Max.   :1990.0   light brown sugar:170   2.0 teaspoon vanilla extract:  16  
##                   (Other)          :857   (Other)                     :1861  
##   Recipe_Index      Rating          Quantity              Unit     
##  AR_101 :  16   Min.   :0.3750   Min.   : 0.000   cup       :1111  
##  AR_4   :  15   1st Qu.:0.7500   1st Qu.: 1.000   egg       : 212  
##  AR_39  :  14   Median :0.8703   Median : 1.640   ounce     :   9  
##  AR_96  :  14   Mean   :0.8150   Mean   : 2.234   tablespoon:  23  
##  AR_110 :  13   3rd Qu.:0.9090   3rd Qu.: 2.667   teaspoon  : 635  
##  AR_14  :  13   Max.   :1.0000   Max.   :48.000                    
##  (Other):1905   NA's   :1010

# Number of rows and columns in the dataset
c(nrow(data), ncol(data))

## [1] 1990    7

# Combined summary for numeric and categorical columns

# Numeric summary: 'Rating' and 'Quantity' are treated as numeric columns.
# The same seven statistics are computed for each column, ignoring missing
# values; the result columns are named "<column>_<statistic>".
numeric_summary <- data %>%
  summarise(
    across(
      c(Rating, Quantity),
      list(
        Min = function(v) min(v, na.rm = TRUE),
        Max = function(v) max(v, na.rm = TRUE),
        Mean = function(v) mean(v, na.rm = TRUE),
        Median = function(v) median(v, na.rm = TRUE),
        SD = function(v) sd(v, na.rm = TRUE),
        Q1 = function(v) quantile(v, 0.25, na.rm = TRUE),
        Q3 = function(v) quantile(v, 0.75, na.rm = TRUE)
      ),
      .names = "{.col}_{.fn}"
    )
  )

print("Numeric Summary:")

## [1] "Numeric Summary:"

print(numeric_summary)

##   Rating_Min Rating_Max Rating_Mean Rating_Median Rating_SD Rating_Q1 Rating_Q3
## 1      0.375          1   0.8149856     0.8702532 0.1356323      0.75 0.9090164
##   Quantity_Min Quantity_Max Quantity_Mean Quantity_Median Quantity_SD
## 1            0           48      2.234415        1.639632    2.529849
##   Quantity_Q1 Quantity_Q3
## 1           1    2.666667

# Categorical Summary
# One-row summary holding, for 'Ingredient' and 'Unit', the distinct values
# and the frequency table as list-columns.
# The distinct values are stored as character labels rather than factors:
# printing a data frame renders a factor held in a list-column as its integer
# level codes (e.g. "1, 2, 3, ..."), which hides the actual names.
# The counts stay as table objects in factor-level order; the data-frame print
# shows only the numbers, so extract an element (for example
# cat_summary$Ingredient_Counts[[1]]) to see the counts with their labels.
cat_summary <- data %>%
  summarise(
    # For 'Ingredient' column
    Ingredient_Values = list(as.character(unique(Ingredient))),
    Ingredient_Counts = list(table(Ingredient)),

    # For 'Unit' column
    Unit_Values = list(as.character(unique(Unit))),
    Unit_Counts = list(table(Unit))
  )
print("Categorical Summary:")

## [1] "Categorical Summary:"

print(cat_summary)

##                                                                                                                                                                                                                                                       Ingredient_Values
## 1 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 14, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 66, 67, 68, 33, 24, 7, 52, 25, 40, 65
##                                                                                                                                                                                                                        Ingredient_Counts
## 1 193, 4, 2, 1, 50, 187, 17, 1, 7, 1, 160, 6, 3, 1, 5, 2, 1, 1, 1, 2, 7, 1, 3, 19, 9, 212, 1, 1, 1, 2, 1, 1, 170, 2, 1, 2, 11, 1, 15, 31, 1, 1, 2, 9, 19, 2, 1, 11, 6, 1, 168, 159, 29, 2, 175, 3, 2, 196, 5, 3, 39, 4, 1, 3, 9, 1, 1, 1
##     Unit_Values           Unit_Counts
## 1 1, 5, 4, 3, 2 1111, 212, 9, 23, 635

# Bundle the numeric and categorical summaries into one named list
combined_summary <- setNames(
  list(numeric_summary, cat_summary),
  c("Numeric", "Categorical")
)
print("Combined Summary:")

## [1] "Combined Summary:"

print(combined_summary)

## $Numeric
##   Rating_Min Rating_Max Rating_Mean Rating_Median Rating_SD Rating_Q1 Rating_Q3
## 1      0.375          1   0.8149856     0.8702532 0.1356323      0.75 0.9090164
##   Quantity_Min Quantity_Max Quantity_Mean Quantity_Median Quantity_SD
## 1            0           48      2.234415        1.639632    2.529849
##   Quantity_Q1 Quantity_Q3
## 1           1    2.666667
## 
## $Categorical
##                                                                                                                                                                                                                                                       Ingredient_Values
## 1 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39, 14, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 66, 67, 68, 33, 24, 7, 52, 25, 40, 65
##                                                                                                                                                                                                                        Ingredient_Counts
## 1 193, 4, 2, 1, 50, 187, 17, 1, 7, 1, 160, 6, 3, 1, 5, 2, 1, 1, 1, 2, 7, 1, 3, 19, 9, 212, 1, 1, 1, 2, 1, 1, 170, 2, 1, 2, 11, 1, 15, 31, 1, 1, 2, 9, 19, 2, 1, 11, 6, 1, 168, 159, 29, 2, 175, 3, 2, 196, 5, 3, 39, 4, 1, 3, 9, 1, 1, 1
##     Unit_Values           Unit_Counts
## 1 1, 5, 4, 3, 2 1111, 212, 9, 23, 635

# Aggregation: average Rating per Ingredient, ignoring missing ratings
agg_rating_by_ingredient <- summarise(
  group_by(data, Ingredient),
  Mean_Rating = mean(Rating, na.rm = TRUE)
)
# Show the per-ingredient means
print("Mean Rating by Ingredient:")

## [1] "Mean Rating by Ingredient:"

print(agg_rating_by_ingredient)

## # A tibble: 68 × 2
##    Ingredient                 Mean_Rating
##    <fct>                            <dbl>
##  1 all purpose flour                0.816
##  2 almond extract                   0.829
##  3 almonds                          0.891
##  4 applesauce                       0.724
##  5 baking powder                    0.804
##  6 baking soda                      0.825
##  7 bittersweet chocolate chip       0.75 
##  8 bourbon                          0.933
##  9 bread flour                      0.5  
## 10 brown rice flour                 0.875
## # ℹ 58 more rows

# Visualization 1: histogram of Rating using bins of width 0.1
data %>%
  ggplot(mapping = aes(x = Rating)) +
  geom_histogram(
    binwidth = 0.1,
    fill = "blue",
    color = "black"
  ) +
  labs(
    title = "Distribution of Ratings",
    x = "Rating",
    y = "Frequency"
  )

## Warning: Removed 1010 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Visualization 2: histogram of Quantity using bins of width 0.5
data %>%
  ggplot(mapping = aes(x = Quantity)) +
  geom_histogram(
    binwidth = 0.5,
    fill = "green",
    color = "black"
  ) +
  labs(
    title = "Distribution of Quantity",
    x = "Quantity",
    y = "Frequency"
  )

# Visualization 3: Scatter plot of Quantity vs. Rating, colored by Ingredient.
# The data contain 68 distinct ingredients; one colour per ingredient cannot
# be told apart and the legend crowds out the plot panel. The 10 most
# frequent ingredients keep their own colour and the rest are pooled into
# "Other". The lumped factor lives only inside this pipeline; `data` itself
# is not modified.
data %>%
  mutate(Ingredient_Group = fct_lump_n(Ingredient, n = 10)) %>%
  ggplot(aes(x = Quantity, y = Rating, color = Ingredient_Group)) +
  geom_point(size = 3) +
  labs(title = "Scatter Plot of Quantity vs. Rating by Ingredient",
       x = "Quantity", y = "Rating", color = "Ingredient") +
  theme_minimal()

## Warning: Removed 1010 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Visualization 4: Box plot showing distribution of Ratings by Ingredient.
# The fill colour only distinguishes neighbouring boxes; every ingredient is
# already named on the x axis, so the fill legend (one entry per ingredient)
# is suppressed to leave the space to the plot itself.
ggplot(data, aes(x = Ingredient, y = Rating, fill = Ingredient)) +
  geom_boxplot(show.legend = FALSE) +
  labs(title = "Box Plot of Ratings by Ingredient",
       x = "Ingredient", y = "Rating") +
  theme_minimal() +
  # Rotate the ingredient names so the axis labels do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Warning: Removed 1010 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Visualization 5: Quantity vs. Rating with a fitted linear trend line
# (no confidence band) drawn over the points
data %>%
  ggplot(mapping = aes(x = Quantity, y = Rating)) +
  geom_point(color = "purple") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Correlation between Quantity and Rating",
    x = "Quantity",
    y = "Rating"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1010 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 1010 rows containing missing values or values outside the scale range
## (`geom_point()`).

# The data set is taken from the Pudding data.
# It lists the ingredients used for chocolate chip cookies, drawn from the
# baking instructions of 211 chocolate chip cookie recipes. The ingredient
# lists are all scaled to yield 48 servings, with the units standardized.
# Where a rating was available on the recipe, it is provided as a numeric
# value. All ratings are normalized to be between 0 and 1.
# Quantity is the amount of the ingredient after scaling to 48 servings.

Assignment2

2024-09-10