# load tidyverse

library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Packages are attached here but no longer installed from inside the script:
# install.packages() would re-download them on every run. If one is missing,
# install it once from the console, e.g.
#   install.packages(c("dplyr", "magrittr"))
# dplyr is already attached by library("tidyverse") above; the explicit call
# is kept so this section still documents what the analysis relies on.
library(dplyr)
library(magrittr)
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
# Load and View dataset
# Read the nutrient table from the project folder, then preview its first rows.
# NOTE(review): the output printed in this file shows Grams as a character
# column (values such as "1,419" carry a thousands separator) - confirm
# whether it should be converted to numeric before it is used.
nutri <- read.csv(file = "/cloud/project/ATu/nutrients4.csv")
head(x = nutri)
##                    Food Measure Grams Calories Protein Fat Sat.Fat Fiber Carbs
## 1            Cows' milk   1 qt.   976      660      32  40      36   0.0    48
## 2             Milk skim   1 qt.   984      360      36  NA      NA   0.0    52
## 3            Buttermilk   1 cup   246      127       9   5       4   0.0    13
## 4 Evaporated, undiluted   1 cup   252      345      16  20      18   0.0    24
## 5        Fortified milk  6 cups 1,419     1373      89  42      23   1.4   119
## 6         Powdered milk   1 cup   103      515      27  28      24   0.0    39
##         Category
## 1 Dairy products
## 2 Dairy products
## 3 Dairy products
## 4 Dairy products
## 5 Dairy products
## 6 Dairy products
# How caloric count varies across the different food categories
# One row per Category holding the mean of Calories; missing Calories values
# are left out of the mean.
Category_Calories <- summarise(
  group_by(nutri, Category),
  avg_calories = mean(Calories, na.rm = TRUE)
)
print(Category_Calories)
## # A tibble: 16 × 2
##    Category                         avg_calories
##    <chr>                                   <dbl>
##  1 Breads, cereals, fastfood,grains        265. 
##  2 Dairy products                          301. 
##  3 Desserts, sweets                        228. 
##  4 Drinks,Alcohol, Beverages                92.7
##  5 Fats, Oils, Shortenings                 259. 
##  6 Fish, Seafood                           153. 
##  7 Fruits A-F                              151. 
##  8 Fruits G-P                              145. 
##  9 Fruits R-Z                              154. 
## 10 Jams, Jellies                           168. 
## 11 Meat, Poultry                           267. 
## 12 Seeds and Nuts                          341. 
## 13 Soups                                   119. 
## 14 Vegetables A-E                           65.9
## 15 Vegetables F-P                           50.8
## 16 Vegetables R-Z                           99.8
# plot the relationship
# Bar chart of mean calories per category. geom_col() draws bar heights
# straight from the y aesthetic, so it needs no stat/identity argument; the
# former `identity = "stat"` was not a valid parameter and only produced an
# "Ignoring unknown parameters" warning.
ggplot(
  data = Category_Calories,
  mapping = aes(x = Category, y = avg_calories)
) +
  geom_col(color = "blue") +
  # Tilt the category labels so the long names do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Category with the highest average protein content per serving

# Mean Protein per Category, leaving missing Protein values out of the mean.
avg_protein_cat <- summarise(
  group_by(nutri, Category),
  avg_protein = mean(Protein, na.rm = TRUE)
)

# Sort by mean protein, highest first, and keep the first five rows.
protein_ranked <- arrange(avg_protein_cat, desc(avg_protein))
max_protein_cat <- slice(protein_ranked, 1:5)
print(max_protein_cat) # five categories with the highest mean protein
## # A tibble: 5 × 2
##   Category                avg_protein
##   <chr>                         <dbl>
## 1 Fish, Seafood                  31.9
## 2 Fats, Oils, Shortenings        29.2
## 3 Meat, Poultry                  18.3
## 4 Dairy products                 18.0
## 5 Seeds and Nuts                 10
# Visualization
# Bar chart of the five categories with the highest mean protein, one fill
# colour per category. geom_col() takes no stat/identity argument; the former
# `identity = "stat"` was not a valid parameter and only produced an
# "Ignoring unknown parameters" warning.
ggplot(
  data = max_protein_cat,
  mapping = aes(x = Category, y = avg_protein, fill = Category)
) +
  geom_col(color = "black") +
  labs(
    title = "Foods with Maximum Protein",
    x = "Category",
    y = "Average Protein"
  ) +
  # Tilt the category labels so the long names do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Descriptive statistics for every column of the nutrient table
# NOTE(review): the summary printed below reports a Protein minimum of -1 and
# maxima of 232-236 spread across Protein, Fat, Sat.Fat, Fiber and Carbs.
# These look like data-entry artefacts - confirm against the source file.
summary(object = nutri)
##      Food             Measure             Grams              Calories     
##  Length:335         Length:335         Length:335         Min.   :   0.0  
##  Class :character   Class :character   Class :character   1st Qu.:  75.0  
##  Mode  :character   Mode  :character   Mode  :character   Median : 132.0  
##                                                           Mean   : 189.3  
##                                                           3rd Qu.: 250.0  
##                                                           Max.   :1373.0  
##                                                           NA's   :2       
##     Protein             Fat            Sat.Fat            Fiber        
##  Min.   : -1.000   Min.   :  0.00   Min.   :  0.000   Min.   :  0.000  
##  1st Qu.:  1.000   1st Qu.:  1.00   1st Qu.:  0.000   1st Qu.:  0.000  
##  Median :  4.000   Median :  8.00   Median :  0.000   Median :  0.400  
##  Mean   :  9.703   Mean   : 13.06   Mean   :  6.742   Mean   :  2.577  
##  3rd Qu.: 13.000   3rd Qu.: 14.00   3rd Qu.:  8.000   3rd Qu.:  1.200  
##  Max.   :232.000   Max.   :233.00   Max.   :234.000   Max.   :235.000  
##  NA's   :39        NA's   :116      NA's   :17        NA's   :27       
##      Carbs          Category        
##  Min.   :  0.00   Length:335        
##  1st Qu.:  6.00   Class :character  
##  Median : 15.00   Mode  :character  
##  Mean   : 25.99                     
##  3rd Qu.: 32.00                     
##  Max.   :236.00                     
##  NA's   :13
# Plot the relationship between protein and fiber
# One point per row of nutri. Rows with a missing Protein or Fiber value are
# dropped by ggplot2, which reports them in a warning.
ggplot(data = nutri, mapping = aes(x = Protein, y = Fiber)) +
  geom_point() +
  labs(
    title = "Scatter plot of Protein vs Fiber",
    x = "Protein Content",
    y = "Fiber Content"
  ) +
  theme_minimal()
## Warning: Removed 59 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Calculate the correlation coefficient
# Correlation between Protein and Fiber, computed only on rows where both
# values are present (use = "complete.obs").
correlation_coefficient <- with(
  nutri,
  cor(Protein, Fiber, use = "complete.obs")
)
print(correlation_coefficient)
## [1] 0.8439945
# Visualize the relationship between fats and calories
# Scatter plot of Calories against Fat, one red point per row of nutri.
# The labels and theme must be chained onto the plot with `+`. Previously the
# chain ended after geom_point(), so labs() + theme_minimal() ran as a
# separate expression (printing NULL) and the plot had no title or theme.
ggplot(data = nutri, mapping = aes(x = Fat, y = Calories)) +
  geom_point(color = "red") +
  labs(
    title = "Relationship between Fat and Calories",
    x = "Fat",
    y = "Calories"
  ) +
  theme_minimal()
## Warning: Removed 116 rows containing missing values or values outside the scale range
## (`geom_point()`).
#  Do foods with higher fat content have higher calorie counts?
# Mean Fat and mean Calories per Food name, averaging over all rows that share
# the name and leaving missing values out of each mean.
fat_calories <- summarise(
  group_by(nutri, Food),
  avg_fat = mean(Fat, na.rm = TRUE),
  avg_cal = mean(Calories, na.rm = TRUE)
)

# Keep the five foods with the highest average fat, highest first
# Sort by average fat in descending order, then keep rows 1 to 5.
fat_ranked <- arrange(fat_calories, desc(avg_fat))
top_fat_foods <- slice(fat_ranked, 1:5)

# Show the resulting five-row table
print(top_fat_foods)
## # A tibble: 5 × 3
##   Food                     avg_fat avg_cal
##   <chr>                      <dbl>   <dbl>
## 1 Oysters                    233      231 
## 2 Lard                       110      992 
## 3 Hydrogenated cooking fat   100      665 
## 4 Margarine                   91      806 
## 5 Butter                      80.3    109.
# Plot the relationship
# Average calories against average fat for the five selected foods; each food
# gets its own point shape, all drawn in blue.
ggplot(top_fat_foods, aes(x = avg_fat, y = avg_cal, shape = Food)) +
  geom_point(color = "blue", size = 3) +
  theme_minimal() +
  labs(
    title = "Comparison of fats to calories: Top 5 foods",
    x = "Average_Fat",
    y = "Average Calories"
  )

# Distribution of macro nutrients (protein, carbs, fats) against Category
# Mean Protein, Carbs and Fat per Category; missing values are left out of
# each mean.
gram_dist <- summarise(
  group_by(nutri, Category),
  avg_protein = mean(Protein, na.rm = TRUE),
  avg_carbs = mean(Carbs, na.rm = TRUE),
  avg_fats = mean(Fat, na.rm = TRUE)
)
print(gram_dist)
## # A tibble: 16 × 4
##    Category                         avg_protein avg_carbs avg_fats
##    <chr>                                  <dbl>     <dbl>    <dbl>
##  1 Breads, cereals, fastfood,grains        9.16     45.8      5.75
##  2 Dairy products                         18.0      28.3     16.5 
##  3 Desserts, sweets                        3.25     40.8      6.27
##  4 Drinks,Alcohol, Beverages               0        15.2      0   
##  5 Fats, Oils, Shortenings                29.2      26.6     45.1 
##  6 Fish, Seafood                          31.9      14.6     18.3 
##  7 Fruits A-F                              1.71     36.9      4   
##  8 Fruits G-P                              1.22     36.0      6.25
##  9 Fruits R-Z                              1.4      41.2      1   
## 10 Jams, Jellies                           0        43.1      0   
## 11 Meat, Poultry                          18.3       1.91    19.2 
## 12 Seeds and Nuts                         10        11.7     30.7 
## 13 Soups                                   5.9      15.5      4.1 
## 14 Vegetables A-E                          4.35     12.7      1.5 
## 15 Vegetables F-P                          3.64     10.9      1   
## 16 Vegetables R-Z                          4.29     16.4      6.33
# Average saturated fat by food category
# Mean Sat.Fat per Category, leaving missing values out of the mean.
average_saturated_fat <- summarise(
  group_by(nutri, Category),
  avg_sat_fat = mean(Sat.Fat, na.rm = TRUE)
)

# Food category with the highest average saturated fat content
# Sort by mean saturated fat in descending order, then keep rows 1 to 5.
sat_fat_ranked <- arrange(average_saturated_fat, desc(avg_sat_fat))
highest_sat_fat <- slice(sat_fat_ranked, 1:5)

print(highest_sat_fat)
## # A tibble: 5 × 2
##   Category                avg_sat_fat
##   <chr>                         <dbl>
## 1 Fats, Oils, Shortenings        38.3
## 2 Seeds and Nuts                 19.3
## 3 Meat, Poultry                  14.7
## 4 Dairy products                 13.4
## 5 Fish, Seafood                  13.3
# Visualization of the comparison
# Bar chart of the five categories kept in highest_sat_fat, one fill colour
# per category.
ggplot(
  highest_sat_fat,
  aes(x = Category, y = avg_sat_fat, fill = Category)
) +
  geom_col() +
  labs(
    title = "Foods with the highest Saturated Fat",
    x = "Food Category",
    y = "Average Saturated Fat"
  ) +
  # theme() comes after theme_minimal() so the tilted labels are not reset.
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Load package
# knitr is attached here but no longer installed from inside the script:
# install.packages() would re-download it on every run. If it is missing,
# install it once from the console with install.packages("knitr").
library(knitr)

# Average protein, carbohydrate and fiber content per food category
# Mean Protein, Carbs and Fiber per Category; missing values are left out of
# each mean.
macro_fiber <- summarise(
  group_by(nutri, Category),
  avg_protein = mean(Protein, na.rm = TRUE),
  avg_Carbs = mean(Carbs, na.rm = TRUE),
  avg_fiber = mean(Fiber, na.rm = TRUE)
)

print(macro_fiber)
## # A tibble: 16 × 4
##    Category                         avg_protein avg_Carbs avg_fiber
##    <chr>                                  <dbl>     <dbl>     <dbl>
##  1 Breads, cereals, fastfood,grains        9.16     45.8      3.13 
##  2 Dairy products                         18.0      28.3      0.157
##  3 Desserts, sweets                        3.25     40.8      0.82 
##  4 Drinks,Alcohol, Beverages               0        15.2      0    
##  5 Fats, Oils, Shortenings                29.2      26.6     16.7  
##  6 Fish, Seafood                          31.9      14.6     13.1  
##  7 Fruits A-F                              1.71     36.9      1.52 
##  8 Fruits G-P                              1.22     36.0      0.879
##  9 Fruits R-Z                              1.4      41.2      2.17 
## 10 Jams, Jellies                           0        43.1      1.14 
## 11 Meat, Poultry                          18.3       1.91     0    
## 12 Seeds and Nuts                         10        11.7      1.55 
## 13 Soups                                   5.9      15.5      0.4  
## 14 Vegetables A-E                          4.35     12.7      1.34 
## 15 Vegetables F-P                          3.64     10.9      1.41 
## 16 Vegetables R-Z                          4.29     16.4      1.12
# Print the same table as a pipe-delimited text table built by kable()
macro_fiber %>%
  kable() %>%
  print()
## 
## 
## |Category                         | avg_protein| avg_Carbs|  avg_fiber|
## |:--------------------------------|-----------:|---------:|----------:|
## |Breads, cereals, fastfood,grains |    9.159091|  45.75556|  3.1327027|
## |Dairy products                   |   17.964286|  28.30435|  0.1571429|
## |Desserts, sweets                 |    3.250000|  40.82759|  0.8200000|
## |Drinks,Alcohol, Beverages        |    0.000000|  15.18182|  0.0000000|
## |Fats, Oils, Shortenings          |   29.250000|  26.55556| 16.7142857|
## |Fish, Seafood                    |   31.894737|  14.61111| 13.0555556|
## |Fruits A-F                       |    1.705882|  36.90909|  1.5227273|
## |Fruits G-P                       |    1.217391|  36.03571|  0.8791667|
## |Fruits R-Z                       |    1.400000|  41.25000|  2.1750000|
## |Jams, Jellies                    |    0.000000|  43.12500|  1.1428571|
## |Meat, Poultry                    |   18.300000|   1.91000|  0.0000000|
## |Seeds and Nuts                   |   10.000000|  11.66667|  1.5500000|
## |Soups                            |    5.900000|  15.50000|  0.4000000|
## |Vegetables A-E                   |    4.346154|  12.74286|  1.3444444|
## |Vegetables F-P                   |    3.636364|  10.92308|  1.4083333|
## |Vegetables R-Z                   |    4.291667|  16.39286|  1.1200000|
# Are there outliers in the dataset based on calorie content?

# dplyr is already attached at the top of the script, so it is not loaded
# again here.

# First and third quartiles of Calories, ignoring missing values. `[[` drops
# the "25%"/"75%" names so the derived bounds are plain numbers.
quartiles <- quantile(nutri$Calories, probs = c(0.25, 0.75), na.rm = TRUE)
Q1 <- quartiles[[1]]
# 75th percentile. Previously stored as Q2, a name that normally means the
# median.
Q3 <- quartiles[[2]]
# Interquartile range. Named calorie_iqr instead of IQR so that the
# stats::IQR() function is not shadowed by a number.
calorie_iqr <- Q3 - Q1

# 1.5 x IQR rule: values below the lower bound or above the upper bound are
# treated as outliers.
lower_bound <- Q1 - 1.5 * calorie_iqr
upper_bound <- Q3 + 1.5 * calorie_iqr

# Keep the rows whose Calories lie outside the bounds. filter() drops rows
# where the condition is NA, so rows with missing Calories are excluded.
outliers <- nutri %>%
  filter(Calories < lower_bound | Calories > upper_bound)
print(outliers)
##                              Food    Measure Grams Calories Protein Fat Sat.Fat
## 1                      Cows' milk      1 qt.   976      660      32  40      36
## 2                  Fortified milk     6 cups 1,419     1373      89  42      23
## 3                   Powdered milk      1 cup   103      515      27  28      24
## 4             (1/2 cup ice cream)     2 cups   540      690      24  24      22
## 5        Hydrogenated cooking fat    1/2 cup   100      665       0 100      88
## 6                            Lard    1/2 cup   110      992       0 110      92
## 7                       Margarine    1/2 cup   112      806      NA  91      76
## 8       Cranberry sauce sweetened      1 cup   277      530      NA  NA       0
## 9            White, 20 slices, or 1-lb. loaf   454     1225      39  15      12
## 10                    Whole-wheat 1-lb. loaf   454     1100      48  14      10
## 11                           Rice      1 cup   208      748      15   3       0
## 12                      Converted      1 cup   187      677      14  NA       0
## 13                          White      1 cup   191      692      14  NA       0
## 14                 Puddings Sugar      1 cup   200      770       0   0       0
## 15 Brown, firm-packed, dark sugar      1 cup   220      815       0  NA       0
##    Fiber Carbs                         Category
## 1    0.0    48                   Dairy products
## 2    1.4   119                   Dairy products
## 3    0.0    39                   Dairy products
## 4    0.0    70                   Dairy products
## 5    0.0     0          Fats, Oils, Shortenings
## 6    0.0     0          Fats, Oils, Shortenings
## 7    0.0    NA          Fats, Oils, Shortenings
## 8    1.2   142                       Fruits A-F
## 9    9.0   229 Breads, cereals, fastfood,grains
## 10  67.5   216 Breads, cereals, fastfood,grains
## 11   1.2   154 Breads, cereals, fastfood,grains
## 12   0.4   142 Breads, cereals, fastfood,grains
## 13   0.3   150 Breads, cereals, fastfood,grains
## 14   0.0   199                 Desserts, sweets
## 15   0.0   210                    Jams, Jellies
# Relationship between fiber content and carbohydrate levels by Category

# Load ggplot2
# ggplot2 is attached here but no longer installed from inside the script:
# install.packages() would re-download it on every run. It is already
# attached by library("tidyverse") at the top of the file; if it is missing,
# install it once from the console with install.packages("ggplot2").
library(ggplot2)

# Mean Carbs and mean Fiber per Category; missing values are left out of each
# mean.
fiber_carb_rel <- summarise(
  group_by(nutri, Category),
  avg_carbs = mean(Carbs, na.rm = TRUE),
  avg_fiber = mean(Fiber, na.rm = TRUE)
)
print(fiber_carb_rel)
## # A tibble: 16 × 3
##    Category                         avg_carbs avg_fiber
##    <chr>                                <dbl>     <dbl>
##  1 Breads, cereals, fastfood,grains     45.8      3.13 
##  2 Dairy products                       28.3      0.157
##  3 Desserts, sweets                     40.8      0.82 
##  4 Drinks,Alcohol, Beverages            15.2      0    
##  5 Fats, Oils, Shortenings              26.6     16.7  
##  6 Fish, Seafood                        14.6     13.1  
##  7 Fruits A-F                           36.9      1.52 
##  8 Fruits G-P                           36.0      0.879
##  9 Fruits R-Z                           41.2      2.17 
## 10 Jams, Jellies                        43.1      1.14 
## 11 Meat, Poultry                         1.91     0    
## 12 Seeds and Nuts                       11.7      1.55 
## 13 Soups                                15.5      0.4  
## 14 Vegetables A-E                       12.7      1.34 
## 15 Vegetables F-P                       10.9      1.41 
## 16 Vegetables R-Z                       16.4      1.12
# Visualize the relationship
# One point per category: mean fiber against mean carbs, coloured by category.
ggplot(
  fiber_carb_rel,
  aes(x = avg_carbs, y = avg_fiber, color = Category)
) +
  geom_point(size = 3)