warning=FALSE
message=FALSE

Source: (2025). Parade.com. https://parade.com/.image/t_share/MTk0NDkxMDIwNTUyNTc4NzM3/fruit-to-eat-every-day.jpg

## Introduction 

I chose the food.csv dataset from the United States Department of Agriculture’s (USDA) Food Composition Database. The dataset includes information about different food items and their nutritional values. I chose this dataset because I'm interested in nutrition and how food affects health, and I believe food literacy should be promoted more to encourage informed eating decisions in people's day-to-day lives.

The variables I’ll be using in this project are:

Categorical (food items, identified by the `Description` column):

- Avocados
- Blueberries
- Salmon
- Almonds
- Lentils

Quantitative:

- Vitamin A
- Vitamin C
- Vitamin K
- Vitamin B6
- Vitamin E

For my final visualization, I will explore the relationship between Vitamin C and Vitamin K across all five selected foods. Since there are no missing or unusual values in this dataset, I am not cleaning it.

## Background Information 

Vitamin C is a water-soluble vitamin involved in protein metabolism that is required for the biosynthesis of collagen, L-carnitine, and certain neurotransmitters. It is essential in wound healing, a crucial component of connective tissue, and an important physiological antioxidant. Unlike most animals, the human body is not able to synthesize vitamin C, making it a dietary necessity. Vitamin K is a fat-soluble vitamin that ensures blood clots properly and promotes bone health. As with vitamin C, the blood-clotting effect of vitamin K makes it a vital part of wound healing.

Source: National Institutes of Health. (2021, March 26). Vitamin C. National Institutes of Health; National Institutes of Health. https://ods.od.nih.gov/factsheets/VitaminC-HealthProfessional/

## Load Libraries


``` r
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(readr)
## Warning: package 'readr' was built under R version 4.4.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.4.3
library(highcharter)
## Warning: package 'highcharter' was built under R version 4.4.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

```

## Load Dataset

``` r

# Read the food composition CSV into `food`.
# The directory is joined onto the file name with file.path() and handed
# straight to read_csv(), so the session's working directory is not changed
# as a side effect of loading the data.
food <- read_csv(
  file.path("C:/Users/Hana Rose/OneDrive/Data 110", "food.csv")
)
## Rows: 7083 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): Category, Description
## dbl (36): Nutrient Data Bank Number, Data.Alpha Carotene, Data.Beta Carotene...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

```

## Select Foods

``` r

# Descriptions of the five foods to analyze. Each string must match a value
# in the dataset's Description column exactly.
foods <- c(
  "Avocado, raw",
  "Blueberries, raw",
  "Salmon, raw",
  "Almonds, unsalted",
  "Lentils, from dried, no added fat"
)

```

## Select Vitamins

``` r

# Names of the vitamin columns to carry into the analysis, spelled exactly
# as they appear in the dataset's header.
vitamins <- c(
  "Data.Vitamins.Vitamin A - RAE",
  "Data.Vitamins.Vitamin C",
  "Data.Vitamins.Vitamin K",
  "Data.Vitamins.Vitamin B6",
  "Data.Vitamins.Vitamin E"
)

```

## Create Nutrition Dataframe

``` r

# Keep only the rows whose Description is one of the selected foods, then
# narrow the columns to Description plus the chosen vitamin columns.
# all_of() makes the selection fail loudly if a vitamin column is missing.
nutrition <- food |>
  filter(Description %in% foods) |>
  select(Description, all_of(vitamins))

# Print per-column summary statistics for the reduced data frame.
summary(nutrition)
##  Description        Data.Vitamins.Vitamin A - RAE Data.Vitamins.Vitamin C
##  Length:5           Min.   : 0                    Min.   : 0.00          
##  Class :character   1st Qu.: 0                    1st Qu.: 0.00          
##  Mode  :character   Median : 3                    Median : 1.40          
##                     Mean   : 9                    Mean   : 4.22          
##                     3rd Qu.: 7                    3rd Qu.: 9.70          
##                     Max.   :35                    Max.   :10.00          
##  Data.Vitamins.Vitamin K Data.Vitamins.Vitamin B6 Data.Vitamins.Vitamin E
##  Min.   : 0.40           Min.   :0.052            Min.   : 0.110         
##  1st Qu.: 1.70           1st Qu.:0.132            1st Qu.: 0.400         
##  Median : 3.50           Median :0.168            Median : 0.570         
##  Mean   : 9.18           Mean   :0.244            Mean   : 5.338         
##  3rd Qu.:19.30           3rd Qu.:0.257            3rd Qu.: 2.070         
##  Max.   :21.00           Max.   :0.611            Max.   :23.540

```

## Simple Linear Regression Model

``` r

# Fit an ordinary least-squares line with Vitamin C as the response and
# Vitamin K as the single predictor, using the rows in `nutrition`.
# Backticks are required because the column names contain spaces.
model <- lm(
  `Data.Vitamins.Vitamin C` ~ `Data.Vitamins.Vitamin K`,
  data = nutrition
)

# Print coefficients, residuals, R-squared, and the F-test for the fit.
summary(model)
## 
## Call:
## lm(formula = `Data.Vitamins.Vitamin C` ~ `Data.Vitamins.Vitamin K`, 
##     data = nutrition)
## 
## Residuals:
##       1       2       3       4       5 
##  0.2169  0.9600 -1.3496 -0.1932  0.3659 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)   
## (Intercept)               -0.41907    0.63372  -0.661  0.55568   
## `Data.Vitamins.Vitamin K`  0.50535    0.04922  10.267  0.00197 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9935 on 3 degrees of freedom
## Multiple R-squared:  0.9723, Adjusted R-squared:  0.9631 
## F-statistic: 105.4 on 1 and 3 DF,  p-value: 0.00197

```

## Diagnostic Plots

``` r

# Draw the standard lm diagnostic panels. `which` is spelled out with the
# same panel numbers plot() uses by default for an lm object (residuals vs
# fitted, normal Q-Q, scale-location, residuals vs leverage).
plot(model, which = c(1, 2, 3, 5))

```

## Linear Regression Analysis

The results of the linear regression analysis show that Vitamin K is a statistically significant predictor of Vitamin C (p = 0.00197). The adjusted R-squared of 0.9631 indicates that about 96.31% of the variation in Vitamin C content can be explained by the variation in Vitamin K. This means that Vitamin K has a strong positive association with Vitamin C in avocados, blueberries, salmon, almonds, and lentils. Because the model is fit to only five foods, this result describes these particular foods and should not be read as a general rule. This correlation is likely caused by the co-presence of vitamin K and vitamin C in plant-based foods as a result of vitamin K being used in photosynthesis and vitamin C being used in stress protection, growth, hormone production, and enzyme function in plants. This would explain why the connection is particularly high in blueberries and avocados as opposed to salmon or non-photosynthetic plant parts like almonds. Due to the statistically significant relationship between Vitamin C and Vitamin K in the selected foods, they will be the vitamins I’ll explore in my visualization.

## Assign Colors to Food Categories

``` r

# Lookup table from food description (vector names) to the color used for
# that food's point in the chart (vector values).
food_colors <- c(
  "Blueberries, raw"                  = "lightblue",
  "Salmon, raw"                       = "pink",
  "Lentils, from dried, no added fat" = "orange",
  "Almonds, unsalted"                 = "brown",
  "Avocado, raw"                      = "green"
)

```

## Create Color Dataframe

``` r

# Copy of `nutrition` with an extra `color` column holding each food's
# display color, looked up by Description in the named `food_colors` vector.
# Indexing a named vector by name returns a result that is itself named;
# unname() drops those names so `color` is a plain character vector and no
# food-name attribute travels with it into later serialization steps.
nutrition_colors <- nutrition %>%
  mutate(color = unname(food_colors[Description]))

```

## Visualization

``` r

# Interactive scatter plot of Vitamin C (y-axis) against Vitamin K (x-axis),
# one point per row of `nutrition_colors`, each drawn in that row's `color`.
hc <- highchart() %>%
  hc_chart(type = "scatter") %>%
  hc_title(text = "Vitamin C vs Vitamin K in Selected Foods") %>%
  hc_xAxis(title = list(text = "Vitamin K (µg)")) %>%
  hc_yAxis(title = list(text = "Vitamin C (mg)")) %>%
  hc_add_series(
    # Reduce the data to the per-point fields x, y, name, and color, then
    # convert the data frame to a list for the series.
    data = list_parse(
      nutrition_colors %>%
        transmute(
          x = `Data.Vitamins.Vitamin K`,
          y = `Data.Vitamins.Vitamin C`,
          name = Description,
          color = color
        )
    ),
    type = "scatter",
    name = "Foods",
    marker = list(radius = 6),
    # Points are labeled through the tooltip, so the legend entry is hidden.
    showInLegend = FALSE
  ) %>%
  hc_tooltip(
    # paste0() joins the fragments with no separator. paste() would insert a
    # space after each <br>, leaving a stray leading space on the second and
    # third tooltip lines.
    pointFormat = paste0(
      "<b>{point.name}</b><br>",
      "Vitamin C: {point.y} mg<br>",
      "Vitamin K: {point.x} µg"
    )
  ) %>%
  hc_caption(text = "Source: USDA Food Data Central") %>%
  hc_add_theme(hc_theme_darkunica())

# Print the chart object.
hc