# load tidyverse
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Attach dplyr and magrittr, installing each one only when it is missing.
# An unconditional install.packages() re-downloads the package on every run
# and errors in non-interactive sessions where no CRAN mirror is configured.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
if (!requireNamespace("magrittr", quietly = TRUE)) {
  install.packages("magrittr")
}
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
# Read the nutrient table from disk and preview its first rows
nutri <- read.csv(file = "/cloud/project/ATu/nutrients4.csv")
head(x = nutri)
## Food Measure Grams Calories Protein Fat Sat.Fat Fiber Carbs
## 1 Cows' milk 1 qt. 976 660 32 40 36 0.0 48
## 2 Milk skim 1 qt. 984 360 36 NA NA 0.0 52
## 3 Buttermilk 1 cup 246 127 9 5 4 0.0 13
## 4 Evaporated, undiluted 1 cup 252 345 16 20 18 0.0 24
## 5 Fortified milk 6 cups 1,419 1373 89 42 23 1.4 119
## 6 Powdered milk 1 cup 103 515 27 28 24 0.0 39
## Category
## 1 Dairy products
## 2 Dairy products
## 3 Dairy products
## 4 Dairy products
## 5 Dairy products
## 6 Dairy products
# How caloric count varies across the different food categories:
# mean of Calories per Category, skipping missing calorie values
Category_Calories <- summarise(
  group_by(nutri, Category),
  avg_calories = mean(Calories, na.rm = TRUE)
)
print(Category_Calories)
## # A tibble: 16 × 2
## Category avg_calories
## <chr> <dbl>
## 1 Breads, cereals, fastfood,grains 265.
## 2 Dairy products 301.
## 3 Desserts, sweets 228.
## 4 Drinks,Alcohol, Beverages 92.7
## 5 Fats, Oils, Shortenings 259.
## 6 Fish, Seafood 153.
## 7 Fruits A-F 151.
## 8 Fruits G-P 145.
## 9 Fruits R-Z 154.
## 10 Jams, Jellies 168.
## 11 Meat, Poultry 267.
## 12 Seeds and Nuts 341.
## 13 Soups 119.
## 14 Vegetables A-E 65.9
## 15 Vegetables F-P 50.8
## 16 Vegetables R-Z 99.8
# Bar chart of the average calories per food category.
# geom_col() draws the y values as given and has no `identity` argument;
# the stray `identity = "stat"` only produced an
# "Ignoring unknown parameters" warning, so it is dropped.
ggplot(
  data = Category_Calories,
  mapping = aes(x = Category, y = avg_calories)
) +
  geom_col(color = "blue") + # blue bar outlines
  # Tilt the category labels so the long names do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Categories with the highest average protein content per serving.
# Step 1: mean protein per category (missing protein values are skipped)
avg_protein_cat <- summarise(
  group_by(nutri, Category),
  avg_protein = mean(Protein, na.rm = TRUE)
)
# Step 2: sort from highest to lowest mean and keep the first five rows
max_protein_cat <- slice_head(
  arrange(avg_protein_cat, desc(avg_protein)),
  n = 5
)
print(max_protein_cat)
## # A tibble: 5 × 2
## Category avg_protein
## <chr> <dbl>
## 1 Fish, Seafood 31.9
## 2 Fats, Oils, Shortenings 29.2
## 3 Meat, Poultry 18.3
## 4 Dairy products 18.0
## 5 Seeds and Nuts 10
# Bar chart of the five categories with the highest average protein.
# geom_col() has no `identity` argument; passing `identity = "stat"` only
# raised an "Ignoring unknown parameters" warning, so it is dropped.
ggplot(
  data = max_protein_cat,
  mapping = aes(x = Category, y = avg_protein, fill = Category)
) +
  geom_col(color = "black") + # black bar outlines
  labs(
    title = "Foods with Maximum Protein",
    x = "Category",
    y = "Average Protein"
  ) +
  # Tilt the category labels so the long names do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Descriptive statistics for every column of the nutrient table.
# NOTE(review): the printed summary reports Grams as character and shows
# suspicious extremes (Protein minimum of -1; maxima of 232-236 across
# Protein, Fat, Sat.Fat, Fiber and Carbs) - presumably data-entry
# artifacts; confirm before relying on the averages and correlation.
summary(object = nutri)
## Food Measure Grams Calories
## Length:335 Length:335 Length:335 Min. : 0.0
## Class :character Class :character Class :character 1st Qu.: 75.0
## Mode :character Mode :character Mode :character Median : 132.0
## Mean : 189.3
## 3rd Qu.: 250.0
## Max. :1373.0
## NA's :2
## Protein Fat Sat.Fat Fiber
## Min. : -1.000 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 1.000 1st Qu.: 1.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 4.000 Median : 8.00 Median : 0.000 Median : 0.400
## Mean : 9.703 Mean : 13.06 Mean : 6.742 Mean : 2.577
## 3rd Qu.: 13.000 3rd Qu.: 14.00 3rd Qu.: 8.000 3rd Qu.: 1.200
## Max. :232.000 Max. :233.00 Max. :234.000 Max. :235.000
## NA's :39 NA's :116 NA's :17 NA's :27
## Carbs Category
## Min. : 0.00 Length:335
## 1st Qu.: 6.00 Class :character
## Median : 15.00 Mode :character
## Mean : 25.99
## 3rd Qu.: 32.00
## Max. :236.00
## NA's :13
# Scatter plot of protein (x axis) against fiber (y axis), one point per row
ggplot(data = nutri, mapping = aes(x = Protein, y = Fiber)) +
  geom_point() +
  labs(
    title = "Scatter plot of Protein vs Fiber",
    x = "Protein Content",
    y = "Fiber Content"
  ) +
  theme_minimal()
## Warning: Removed 59 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Correlation between protein and fiber, computed only on the rows where
# both values are present
correlation_coefficient <- with(
  nutri,
  cor(Protein, Fiber, use = "complete.obs")
)
print(correlation_coefficient)
## [1] 0.8439945
# Visualize the relationship between fats and calories.
# The title, axis labels and theme have to be joined to the plot with `+`.
# Written as a separate statement they evaluated to NULL and were never
# applied to the plot.
ggplot(data = nutri, mapping = aes(x = Fat, y = Calories)) +
  geom_point(color = "red") +
  labs(
    title = "Relationship between Fat and Calories",
    x = "Fat",
    y = "Calories"
  ) +
  theme_minimal()
## Warning: Removed 116 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Do foods with higher fat content have higher calorie counts?
# Mean fat and mean calories per food (missing values are skipped)
fat_calories <- summarise(
  group_by(nutri, Food),
  avg_fat = mean(Fat, na.rm = TRUE),
  avg_cal = mean(Calories, na.rm = TRUE)
)
# Sort by mean fat, highest first, and keep the first five foods
top_fat_foods <- slice_head(
  arrange(fat_calories, desc(avg_fat)),
  n = 5
)
# Show the five fattiest foods with their mean calories
print(top_fat_foods)
## # A tibble: 5 × 3
## Food avg_fat avg_cal
## <chr> <dbl> <dbl>
## 1 Oysters 233 231
## 2 Lard 110 992
## 3 Hydrogenated cooking fat 100 665
## 4 Margarine 91 806
## 5 Butter 80.3 109.
# Scatter plot of mean fat against mean calories for the top-fat foods;
# each food gets its own point shape
ggplot(top_fat_foods, aes(x = avg_fat, y = avg_cal, shape = Food)) +
  geom_point(size = 3, color = "blue") +
  theme_minimal() +
  labs(
    title = "Comparison of fats to calories: Top 5 foods",
    x = "Average_Fat",
    y = "Average Calories"
  )

# Distribution of macro nutrients (protein, carbs, fats) against Category:
# one row per category holding the mean of each nutrient, NAs skipped
gram_dist <- summarise(
  group_by(nutri, Category),
  avg_protein = mean(Protein, na.rm = TRUE),
  avg_carbs = mean(Carbs, na.rm = TRUE),
  avg_fats = mean(Fat, na.rm = TRUE)
)
print(gram_dist)
## # A tibble: 16 × 4
## Category avg_protein avg_carbs avg_fats
## <chr> <dbl> <dbl> <dbl>
## 1 Breads, cereals, fastfood,grains 9.16 45.8 5.75
## 2 Dairy products 18.0 28.3 16.5
## 3 Desserts, sweets 3.25 40.8 6.27
## 4 Drinks,Alcohol, Beverages 0 15.2 0
## 5 Fats, Oils, Shortenings 29.2 26.6 45.1
## 6 Fish, Seafood 31.9 14.6 18.3
## 7 Fruits A-F 1.71 36.9 4
## 8 Fruits G-P 1.22 36.0 6.25
## 9 Fruits R-Z 1.4 41.2 1
## 10 Jams, Jellies 0 43.1 0
## 11 Meat, Poultry 18.3 1.91 19.2
## 12 Seeds and Nuts 10 11.7 30.7
## 13 Soups 5.9 15.5 4.1
## 14 Vegetables A-E 4.35 12.7 1.5
## 15 Vegetables F-P 3.64 10.9 1
## 16 Vegetables R-Z 4.29 16.4 6.33
# Average saturated fat by food category (missing values are skipped)
average_saturated_fat <- summarise(
  group_by(nutri, Category),
  avg_sat_fat = mean(Sat.Fat, na.rm = TRUE)
)
# Five categories with the highest average saturated fat, highest first
highest_sat_fat <- slice_head(
  arrange(average_saturated_fat, desc(avg_sat_fat)),
  n = 5
)
print(highest_sat_fat)
## # A tibble: 5 × 2
## Category avg_sat_fat
## <chr> <dbl>
## 1 Fats, Oils, Shortenings 38.3
## 2 Seeds and Nuts 19.3
## 3 Meat, Poultry 14.7
## 4 Dairy products 13.4
## 5 Fish, Seafood 13.3
# Bar chart comparing the categories with the most saturated fat
ggplot(
  highest_sat_fat,
  aes(x = Category, y = avg_sat_fat, fill = Category)
) +
  geom_col() +
  labs(
    title = "Foods with the highest Saturated Fat",
    x = "Food Category",
    y = "Average Saturated Fat"
  ) +
  theme_minimal() +
  # Applied after theme_minimal() so the label rotation is not overwritten
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Load knitr, installing it only when it is missing.
# An unconditional install.packages() re-downloads the package on every run
# and errors in non-interactive sessions where no CRAN mirror is configured.
if (!requireNamespace("knitr", quietly = TRUE)) {
  install.packages("knitr")
}
library(knitr)
# Macro nutrients with the highest fiber: per-category means of protein,
# carbs and fiber (missing values are skipped)
macro_fiber <- summarise(
  group_by(nutri, Category),
  avg_protein = mean(Protein, na.rm = TRUE),
  avg_Carbs = mean(Carbs, na.rm = TRUE),
  avg_fiber = mean(Fiber, na.rm = TRUE)
)
print(macro_fiber)
## # A tibble: 16 × 4
## Category avg_protein avg_Carbs avg_fiber
## <chr> <dbl> <dbl> <dbl>
## 1 Breads, cereals, fastfood,grains 9.16 45.8 3.13
## 2 Dairy products 18.0 28.3 0.157
## 3 Desserts, sweets 3.25 40.8 0.82
## 4 Drinks,Alcohol, Beverages 0 15.2 0
## 5 Fats, Oils, Shortenings 29.2 26.6 16.7
## 6 Fish, Seafood 31.9 14.6 13.1
## 7 Fruits A-F 1.71 36.9 1.52
## 8 Fruits G-P 1.22 36.0 0.879
## 9 Fruits R-Z 1.4 41.2 2.17
## 10 Jams, Jellies 0 43.1 1.14
## 11 Meat, Poultry 18.3 1.91 0
## 12 Seeds and Nuts 10 11.7 1.55
## 13 Soups 5.9 15.5 0.4
## 14 Vegetables A-E 4.35 12.7 1.34
## 15 Vegetables F-P 3.64 10.9 1.41
## 16 Vegetables R-Z 4.29 16.4 1.12
# Render the per-category means as a kable table and print it
macro_fiber %>%
  kable() %>%
  print()
##
##
## |Category | avg_protein| avg_Carbs| avg_fiber|
## |:--------------------------------|-----------:|---------:|----------:|
## |Breads, cereals, fastfood,grains | 9.159091| 45.75556| 3.1327027|
## |Dairy products | 17.964286| 28.30435| 0.1571429|
## |Desserts, sweets | 3.250000| 40.82759| 0.8200000|
## |Drinks,Alcohol, Beverages | 0.000000| 15.18182| 0.0000000|
## |Fats, Oils, Shortenings | 29.250000| 26.55556| 16.7142857|
## |Fish, Seafood | 31.894737| 14.61111| 13.0555556|
## |Fruits A-F | 1.705882| 36.90909| 1.5227273|
## |Fruits G-P | 1.217391| 36.03571| 0.8791667|
## |Fruits R-Z | 1.400000| 41.25000| 2.1750000|
## |Jams, Jellies | 0.000000| 43.12500| 1.1428571|
## |Meat, Poultry | 18.300000| 1.91000| 0.0000000|
## |Seeds and Nuts | 10.000000| 11.66667| 1.5500000|
## |Soups | 5.900000| 15.50000| 0.4000000|
## |Vegetables A-E | 4.346154| 12.74286| 1.3444444|
## |Vegetables F-P | 3.636364| 10.92308| 1.4083333|
## |Vegetables R-Z | 4.291667| 16.39286| 1.1200000|
# Are there outliers in the dataset based on calorie content?
# load dplyr
library (dplyr)
# Identify calorie outliers with the 1.5 * IQR rule.
# First and third quartiles of Calories (missing values ignored); `[[`
# drops the "25%"/"75%" names so they do not carry over into the bounds.
quartiles <- quantile(nutri$Calories, probs = c(0.25, 0.75), na.rm = TRUE)
Q1 <- quartiles[[1]]
Q3 <- quartiles[[2]] # 75th percentile (was mislabelled Q2, i.e. the median)
# Interquartile range; named calories_iqr rather than IQR so the variable
# does not reuse the name of the stats::IQR() function
calories_iqr <- Q3 - Q1
# Establish lower and upper bounds to identify outliers
lower_bound <- Q1 - 1.5 * calories_iqr
upper_bound <- Q3 + 1.5 * calories_iqr
# Keep the rows whose Calories fall outside the bounds; filter() drops
# rows where Calories is missing
outliers <- nutri %>%
  filter(Calories < lower_bound | Calories > upper_bound)
print(outliers)
## Food Measure Grams Calories Protein Fat Sat.Fat
## 1 Cows' milk 1 qt. 976 660 32 40 36
## 2 Fortified milk 6 cups 1,419 1373 89 42 23
## 3 Powdered milk 1 cup 103 515 27 28 24
## 4 (1/2 cup ice cream) 2 cups 540 690 24 24 22
## 5 Hydrogenated cooking fat 1/2 cup 100 665 0 100 88
## 6 Lard 1/2 cup 110 992 0 110 92
## 7 Margarine 1/2 cup 112 806 NA 91 76
## 8 Cranberry sauce sweetened 1 cup 277 530 NA NA 0
## 9 White, 20 slices, or 1-lb. loaf 454 1225 39 15 12
## 10 Whole-wheat 1-lb. loaf 454 1100 48 14 10
## 11 Rice 1 cup 208 748 15 3 0
## 12 Converted 1 cup 187 677 14 NA 0
## 13 White 1 cup 191 692 14 NA 0
## 14 Puddings Sugar 1 cup 200 770 0 0 0
## 15 Brown, firm-packed, dark sugar 1 cup 220 815 0 NA 0
## Fiber Carbs Category
## 1 0.0 48 Dairy products
## 2 1.4 119 Dairy products
## 3 0.0 39 Dairy products
## 4 0.0 70 Dairy products
## 5 0.0 0 Fats, Oils, Shortenings
## 6 0.0 0 Fats, Oils, Shortenings
## 7 0.0 NA Fats, Oils, Shortenings
## 8 1.2 142 Fruits A-F
## 9 9.0 229 Breads, cereals, fastfood,grains
## 10 67.5 216 Breads, cereals, fastfood,grains
## 11 1.2 154 Breads, cereals, fastfood,grains
## 12 0.4 142 Breads, cereals, fastfood,grains
## 13 0.3 150 Breads, cereals, fastfood,grains
## 14 0.0 199 Desserts, sweets
## 15 0.0 210 Jams, Jellies
# Relationship between fiber content and carbohydrate levels by Category
# Load ggplot2, installing it only when it is missing.
# An unconditional install.packages() re-downloads the package on every run
# and errors in non-interactive sessions where no CRAN mirror is configured.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
# Per-category mean carbohydrate and mean fiber (missing values skipped)
fiber_carb_rel <- summarise(
  group_by(nutri, Category),
  avg_carbs = mean(Carbs, na.rm = TRUE),
  avg_fiber = mean(Fiber, na.rm = TRUE)
)
print(fiber_carb_rel)
## # A tibble: 16 × 3
## Category avg_carbs avg_fiber
## <chr> <dbl> <dbl>
## 1 Breads, cereals, fastfood,grains 45.8 3.13
## 2 Dairy products 28.3 0.157
## 3 Desserts, sweets 40.8 0.82
## 4 Drinks,Alcohol, Beverages 15.2 0
## 5 Fats, Oils, Shortenings 26.6 16.7
## 6 Fish, Seafood 14.6 13.1
## 7 Fruits A-F 36.9 1.52
## 8 Fruits G-P 36.0 0.879
## 9 Fruits R-Z 41.2 2.17
## 10 Jams, Jellies 43.1 1.14
## 11 Meat, Poultry 1.91 0
## 12 Seeds and Nuts 11.7 1.55
## 13 Soups 15.5 0.4
## 14 Vegetables A-E 12.7 1.34
## 15 Vegetables F-P 10.9 1.41
## 16 Vegetables R-Z 16.4 1.12
# Scatter plot of mean carbs against mean fiber, one colored point per
# food category
ggplot(
  fiber_carb_rel,
  aes(x = avg_carbs, y = avg_fiber, color = Category)
) +
  geom_point(size = 3)
