Olive Oil Chemical Composition

# Load the packages needed for this assignment
library(dslabs)      # Contains the olive dataset
library(ggplot2)     # For creating the static plot
library(dplyr)       # For data manipulation

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(highcharter) # For the interactive version of the plot
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 

Attaching package: 'highcharter'
The following object is masked from 'package:dslabs':

    stars
# Bring the olive data frame (shipped with dslabs) into the workspace
data(olive)

# Print the top six rows to get a feel for the columns and their values
head(olive, n = 6)
          region         area palmitic palmitoleic stearic oleic linoleic
1 Southern Italy North-Apulia    10.75        0.75    2.26 78.23     6.72
2 Southern Italy North-Apulia    10.88        0.73    2.24 77.09     7.81
3 Southern Italy North-Apulia     9.11        0.54    2.46 81.13     5.49
4 Southern Italy North-Apulia     9.66        0.57    2.40 79.52     6.19
5 Southern Italy North-Apulia    10.51        0.67    2.59 77.71     6.72
6 Southern Italy North-Apulia     9.11        0.49    2.68 79.24     6.78
  linolenic arachidic eicosenoic
1      0.36      0.60       0.29
2      0.31      0.61       0.29
3      0.31      0.63       0.29
4      0.50      0.78       0.35
5      0.50      0.80       0.46
6      0.51      0.70       0.44
# Initialise the plot with olive as the data source.
# aes() binds columns to visual channels:
#   oleic -> x-axis, linoleic -> y-axis, region -> point color
ggplot(
  data = olive,
  mapping = aes(x = oleic, y = linoleic, colour = region)
)

# First layer of the scatterplot: geom_point() draws one dot per row.
#   alpha = 0.6 -> partly transparent, so overlapping dots stay visible
#   size  = 2.5 -> dots drawn somewhat larger than the default
ggplot(
  data = olive,
  mapping = aes(x = oleic, y = linoleic, colour = region)
) +
  geom_point(size = 2.5, alpha = 0.6)

# Second layer: geom_smooth() overlays one trend line per region.
#   method = "lm" -> fit a straight (linear) line rather than a curve
#   se = FALSE    -> omit the shaded confidence band around each line
ggplot(
  data = olive,
  mapping = aes(x = oleic, y = linoleic, colour = region)
) +
  geom_point(size = 2.5, alpha = 0.6) +
  geom_smooth(linewidth = 0.8, se = FALSE, method = "lm")
`geom_smooth()` using formula = 'y ~ x'

# Styling step:
#   scale_colour_manual() replaces ggplot's default palette with one
#   hand-picked, named color per region
#   theme_classic() drops the default gray background and grid lines
ggplot(
  data = olive,
  mapping = aes(x = oleic, y = linoleic, colour = region)
) +
  geom_point(size = 2.5, alpha = 0.6) +
  geom_smooth(linewidth = 0.8, se = FALSE, method = "lm") +
  scale_colour_manual(
    values = c(
      "Northern Italy" = "orange",
      "Sardinia" = "steelblue",
      "Southern Italy" = "forestgreen"
    )
  ) +
  theme_classic()
`geom_smooth()` using formula = 'y ~ x'

# Complete, finished ggplot chart: points, per-region trend lines, custom
# palette, classic theme, and labs() supplying the title, subtitle,
# axis labels, legend title, and caption
ggplot(
  data = olive,
  mapping = aes(x = oleic, y = linoleic, colour = region)
) +
  geom_point(size = 2.5, alpha = 0.6) +
  geom_smooth(linewidth = 0.8, se = FALSE, method = "lm") +
  scale_colour_manual(
    values = c(
      "Northern Italy" = "orange",
      "Sardinia" = "steelblue",
      "Southern Italy" = "forestgreen"
    )
  ) +
  labs(
    title = "Oleic vs. Linoleic Acid Content in Italian Olive Oils",
    subtitle = "Each point represents one olive oil sample; lines show regional trends",
    x = "Oleic Acid (% composition)",
    y = "Linoleic Acid (% composition)",
    colour = "Region of Origin",
    caption = "Data source: DS Labs (dslabs R package)"
  ) +
  theme_classic()
`geom_smooth()` using formula = 'y ~ x'

# Split olive into one data frame per region with dplyr's filter(),
# so each region can be added to the interactive chart as its own series
northern <- filter(olive, region == "Northern Italy")
sardinia <- filter(olive, region == "Sardinia")
southern <- filter(olive, region == "Southern Italy")
# Turn each region's oleic (x) and linoleic (y) columns into the
# list-of-points format that highcharter needs; list_parse2() does the
# data frame -> list conversion
northern_points <- list_parse2(
  with(northern, data.frame(x = oleic, y = linoleic))
)
sardinia_points <- list_parse2(
  with(sardinia, data.frame(x = oleic, y = linoleic))
)
southern_points <- list_parse2(
  with(southern, data.frame(x = oleic, y = linoleic))
)

# Assemble the interactive chart:
#   highchart()             -> start from an empty chart
#   hc_add_series()         -> add one region's dots per call
#   hc_title / hc_subtitle  -> chart headings
#   hc_xAxis / hc_yAxis     -> axis labels
#   hc_tooltip              -> show the exact values when hovering over a dot
highchart() |>
  hc_add_series(
    data = northern_points, type = "scatter",
    name = "Northern Italy", color = "orange"
  ) |>
  hc_add_series(
    data = sardinia_points, type = "scatter",
    name = "Sardinia", color = "steelblue"
  ) |>
  hc_add_series(
    data = southern_points, type = "scatter",
    name = "Southern Italy", color = "forestgreen"
  ) |>
  hc_title(text = "Oleic vs. Linoleic Acid Content in Italian Olive Oils") |>
  hc_subtitle(text = "Interactive – click a region in the legend to show/hide it") |>
  hc_xAxis(title = list(text = "Oleic Acid (% composition)")) |>
  hc_yAxis(title = list(text = "Linoleic Acid (% composition)")) |>
  hc_tooltip(pointFormat = "Oleic: {point.x}<br>Linoleic: {point.y}")

For Assignment 7, I used the olive dataset from the dslabs package, which contains the chemical composition of 572 olive oil samples from the following regions: Northern Italy, Sardinia, and Southern Italy. Each sample records the percentages of eight different fatty acids found in the oil.

I compared two fatty acids found in the oil, oleic acid and linoleic acid, and colored each point by region. I also added a trend line for each region to make the pattern easier to see.

One thing I noticed is that when oleic acid is high, linoleic acid tends to be low. This happens across all three regions. I also noticed that Sardinia’s oils have lower oleic acid levels compared to the other two regions, which shows that where the oil comes from affects its chemical makeup.

I made two versions of the chart: one using ggplot2 and one using highcharter, which is interactive and lets you hover over points and click the legend entries to show or hide each region.