This project was inspired by Ken Nordine

(audio included under ASCAP educational license)

The olive data set of the Data Science Labs Package describes the percentage composition of fatty acids in olive oil from a variety of regions and areas of Italy collected from J. Zupan, et al., 1994. We will look for correlations between the components and their origins and plot what looks interesting.

Background from The Olive Oil Source:

The fatty acid composition of olive oil varies widely depending on the cultivar, maturity of the fruit, altitude, climate, and several other factors.

The major fatty acids in olive oil triacylglycerols are:

The olive dataset also includes arachidic, eicosenoic, and palmitoleic acids.

load the required libraries:

library(dslabs)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggplot2)
library(RColorBrewer)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

fetch, tibblelize and characterize the olive dataset

data(olive)
olive<-as.tibble(olive)
## Warning: `as.tibble()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.
# examine its structure:
str(olive)
## Classes 'tbl_df', 'tbl' and 'data.frame':    572 obs. of  10 variables:
##  $ region     : Factor w/ 3 levels "Northern Italy",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ area       : Factor w/ 9 levels "Calabria","Coast-Sardinia",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ palmitic   : num  10.75 10.88 9.11 9.66 10.51 ...
##  $ palmitoleic: num  0.75 0.73 0.54 0.57 0.67 0.49 0.66 0.61 0.6 0.55 ...
##  $ stearic    : num  2.26 2.24 2.46 2.4 2.59 2.68 2.64 2.35 2.39 2.13 ...
##  $ oleic      : num  78.2 77.1 81.1 79.5 77.7 ...
##  $ linoleic   : num  6.72 7.81 5.49 6.19 6.72 6.78 6.18 7.34 7.09 6.33 ...
##  $ linolenic  : num  0.36 0.31 0.31 0.5 0.5 0.51 0.49 0.39 0.46 0.26 ...
##  $ arachidic  : num  0.6 0.61 0.63 0.78 0.8 0.7 0.56 0.64 0.83 0.52 ...
##  $ eicosenoic : num  0.29 0.29 0.29 0.35 0.46 0.44 0.29 0.35 0.33 0.3 ...
# convert factors to characters
olive$region<-as.character(olive$region)
olive$area<-as.character(olive$area)
# check the summary
summary(olive)
##     region              area              palmitic      palmitoleic    
##  Length:572         Length:572         Min.   : 6.10   Min.   :0.1500  
##  Class :character   Class :character   1st Qu.:10.95   1st Qu.:0.8775  
##  Mode  :character   Mode  :character   Median :12.01   Median :1.1000  
##                                        Mean   :12.32   Mean   :1.2609  
##                                        3rd Qu.:13.60   3rd Qu.:1.6925  
##                                        Max.   :17.53   Max.   :2.8000  
##     stearic          oleic          linoleic        linolenic     
##  Min.   :1.520   Min.   :63.00   Min.   : 4.480   Min.   :0.0000  
##  1st Qu.:2.050   1st Qu.:70.00   1st Qu.: 7.707   1st Qu.:0.2600  
##  Median :2.230   Median :73.03   Median :10.300   Median :0.3300  
##  Mean   :2.289   Mean   :73.12   Mean   : 9.805   Mean   :0.3189  
##  3rd Qu.:2.490   3rd Qu.:76.80   3rd Qu.:11.807   3rd Qu.:0.4025  
##  Max.   :3.750   Max.   :84.10   Max.   :14.700   Max.   :0.7400  
##    arachidic       eicosenoic    
##  Min.   :0.000   Min.   :0.0100  
##  1st Qu.:0.500   1st Qu.:0.0200  
##  Median :0.610   Median :0.1700  
##  Mean   :0.581   Mean   :0.1628  
##  3rd Qu.:0.700   3rd Qu.:0.2800  
##  Max.   :1.050   Max.   :0.5800

make sure that the total percentages are acceptably close to 100

totalPcts<-olive %>%
  select(-region,-area) %>%
  rowSums()

summary(totalPcts)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   98.11   99.70   99.96   99.85  100.01  101.02
length(totalPcts)
## [1] 572

look at the distribution of observations by region and areas

  olive %>%
  select(region,area) %>%
  group_by(region, area) %>%
  summarise(count=n())
## # A tibble: 9 x 3
## # Groups:   region [3]
##   region         area            count
##   <chr>          <chr>           <int>
## 1 Northern Italy East-Liguria       50
## 2 Northern Italy Umbria             51
## 3 Northern Italy West-Liguria       50
## 4 Sardinia       Coast-Sardinia     33
## 5 Sardinia       Inland-Sardinia    65
## 6 Southern Italy Calabria           56
## 7 Southern Italy North-Apulia       25
## 8 Southern Italy Sicily             36
## 9 Southern Italy South-Apulia      206

Look at country wide fatty acid correlations:

options(width = 300)
corFA<-as.data.frame( cor(olive[,3:10]))
print(as_tibble(corFA),digits = 2, width=300)
## # A tibble: 8 x 8
##   palmitic palmitoleic stearic  oleic linoleic linolenic arachidic eicosenoic
##      <dbl>       <dbl>   <dbl>  <dbl>    <dbl>     <dbl>     <dbl>      <dbl>
## 1    1          0.836  -0.170  -0.837   0.461     0.319     0.228      0.502 
## 2    0.836      1      -0.222  -0.852   0.622     0.0931    0.0855     0.416 
## 3   -0.170     -0.222   1       0.114  -0.198     0.0189   -0.0410     0.140 
## 4   -0.837     -0.852   0.114   1      -0.850    -0.218    -0.320     -0.424 
## 5    0.461      0.622  -0.198  -0.850   1        -0.0574    0.211      0.0890
## 6    0.319      0.0931  0.0189 -0.218  -0.0574    1         0.620      0.578 
## 7    0.228      0.0855 -0.0410 -0.320   0.211     0.620     1          0.329 
## 8    0.502      0.416   0.140  -0.424   0.0890    0.578     0.329      1

summarise absolute correlations

mask<-upper.tri(corFA,diag = FALSE)
f<-function(a) if (a) 1 else NA

# eliminate lower triangle
corFA<-as.matrix(corFA*sapply(mask, f))
print(corFA)
##             palmitic palmitoleic    stearic      oleic   linoleic   linolenic   arachidic  eicosenoic
## palmitic          NA    0.835605 -0.1703918 -0.8373354  0.4606845  0.31932669  0.22829912  0.50195179
## palmitoleic       NA          NA -0.2221854 -0.8524384  0.6216267  0.09311163  0.08548117  0.41635048
## stearic           NA          NA         NA  0.1135987 -0.1978169  0.01891719 -0.04097892  0.14037748
## oleic             NA          NA         NA         NA -0.8503184 -0.21817123 -0.31996234 -0.42414586
## linoleic          NA          NA         NA         NA         NA -0.05743858  0.21097260  0.08904499
## linolenic         NA          NA         NA         NA         NA          NA  0.62023577  0.57831851
## arachidic         NA          NA         NA         NA         NA          NA          NA  0.32866349
## eicosenoic        NA          NA         NA         NA         NA          NA          NA          NA
summary(abs(as.numeric(corFA)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.01892 0.13368 0.27381 0.35192 0.52104 0.85244      36
# save third quartile as threshold cut off
q3<-summary(abs(as.numeric(corFA)))[5]

identify all pairs with significant correlations >= 0.5210435

f<-function(a) if (is.na(a)) NA else if(abs(a)>=q3) 1 else NA
sig<-(corFA*sapply(corFA, f))

print(sig,digits = 2)
##             palmitic palmitoleic stearic oleic linoleic linolenic arachidic eicosenoic
## palmitic          NA        0.84      NA -0.84       NA        NA        NA         NA
## palmitoleic       NA          NA      NA -0.85     0.62        NA        NA         NA
## stearic           NA          NA      NA    NA       NA        NA        NA         NA
## oleic             NA          NA      NA    NA    -0.85        NA        NA         NA
## linoleic          NA          NA      NA    NA       NA        NA        NA         NA
## linolenic         NA          NA      NA    NA       NA        NA      0.62       0.58
## arachidic         NA          NA      NA    NA       NA        NA        NA         NA
## eicosenoic        NA          NA      NA    NA       NA        NA        NA         NA
fa2<-as.character(sapply(rownames(corFA),FUN=function(b) paste(b,rownames(corFA),sep = "-")))

a<-unlist(mapply(FUN=function(a,b) if (!is.na(a)) paste(b,a ),as.numeric(sig),fa2))
for ( i in a) print(i)
## [1] "palmitoleic-palmitic 0.835604968040802"
## [1] "oleic-palmitic -0.837335352032493"
## [1] "oleic-palmitoleic -0.852438351564888"
## [1] "linoleic-palmitoleic 0.621626663681314"
## [1] "linoleic-oleic -0.850318366225752"
## [1] "arachidic-linolenic 0.620235772288214"
## [1] "eicosenoic-linolenic 0.578318507487605"

This suggests three plots:

  • %oleic vs %( palmitic, linoleic, palmitoleic )
  • %linolenic vs %( arachidic, eicosenoic )
  • %palmitoleic vs %( palmitic, linoleic)

using color filled by area and labeled by region abbreviation.

Map region to region abbreviations so it fits on a map label:

This did not show up on plots (too crowded to fit?)

olive <- olive %>%
  mutate(regionAbbr=
           if (grepl("North",region)[1]) "NI" # northern Italy
          else if (grepl("South",region)[1]) "SI" # southern Italy
          else "Sar" # Sardinia
  )  

Model linear regression: oleic ~ palmitic + linoleic + palmitoleic

lm<-lm(oleic ~  palmitic + linoleic + palmitoleic, olive)
summary(lm)
## 
## Call:
## lm(formula = oleic ~ palmitic + linoleic + palmitoleic, data = olive)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.42546 -0.35781  0.02799  0.43925  1.85301 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 98.64750    0.29112  338.86  < 2e-16 ***
## palmitic    -1.26566    0.02884  -43.89  < 2e-16 ***
## linoleic    -0.95697    0.01404  -68.14  < 2e-16 ***
## palmitoleic -0.44179    0.10495   -4.21 2.97e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6322 on 568 degrees of freedom
## Multiple R-squared:  0.9759, Adjusted R-squared:  0.9757 
## F-statistic:  7652 on 3 and 568 DF,  p-value: < 2.2e-16
# create predicted column
model <- predict(lm,se=TRUE)
olive <-olive %>%
  mutate(predicted = model$fit) %>%
  # set the residual boundary lines at the 96th percentile on either side of the regression line.
  mutate(upperSe= predicted + 1.96*model$se.fit) %>%
  mutate(lowerSe= predicted - 1.96*model$se.fit)

Plot 1:

# lm<-lm(oleic ~  palmitic + linoleic + palmitoleic, olive)
plot1<-olive %>%
  ggplot(aes(y=predicted,
             x=oleic, 
             color=area, 
             labelR=region,
             labelP1=palmitic,
             labelP2=palmitoleic,
             labelL=linoleic
             )
        )+
    theme_classic(base_size = 12)+
    geom_point()+
  # plot regression line with standard error:
    geom_smooth(aes(ymin=lowerSe,ymax=upperSe),
                color="black",
                se=TRUE,
                method = 'loess',
                formula = 'y ~ x',
                stat="smooth"
    )+
    scale_fill_distiller(palette = "Set2")+
  theme(
    plot.title = element_text( size=10, face="plain",lineheight =.8,hjust = .5 ,vjust = 1),
    axis.title.x = element_text(size=12, face="plain"),
    legend.title = element_blank()
)+
    labs(title="\nOleic Acid ~ Palmitic + Linoleic + Palmitoleic",
          x="Percentage Oleic Acid",
          y="Predicted Percentage Oleic Acid"
    )


ggplotly(plot1)

Fatty Acid Content of Italian Olive Oils

Model linear regression: %linolenic ~ arachidic + eicosenoic

lm<-lm(linolenic ~ arachidic +  eicosenoic, olive)
summary(lm)
## 
## Call:
## lm(formula = linolenic ~ arachidic + eicosenoic, data = olive)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.16907 -0.06204 -0.01048  0.04701  0.40868 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.09100    0.01044   8.718   <2e-16 ***
## arachidic    0.28389    0.01769  16.052   <2e-16 ***
## eicosenoic   0.38659    0.02767  13.974   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08793 on 569 degrees of freedom
## Multiple R-squared:  0.5419, Adjusted R-squared:  0.5403 
## F-statistic: 336.5 on 2 and 569 DF,  p-value: < 2.2e-16
# create predicted column

model <- predict(lm,se=TRUE)
olive <-olive %>%
  mutate(predicted = model$fit) %>%
  mutate(upperSe= predicted + 1.96*model$se.fit) %>%
  mutate(lowerSe= predicted - 1.96*model$se.fit)

Plot 2:

plot2<-olive %>%
  ggplot(aes(y=predicted,
             x=linolenic, 
             color=area, 
             labelR=region,
             labelP1=arachidic,
             labelP2=eicosenoic,
             )
        )+
    theme_classic(base_size = 12)+
    geom_point()+
  #plot regression line
  geom_smooth(aes(ymin=lowerSe,ymax=upperSe),
                color="black",
                se=TRUE,
                method = 'loess',
                formula = 'y ~ x',
                stat="smooth"
    )+
    
    scale_fill_distiller(palette = "Set2")+
  theme(
    plot.title = element_text( size=10, face="plain",lineheight =.8,hjust = .5,vjust = 1),
    axis.title.x = element_text(size=12, face="plain"),
    legend.title = element_blank()
)+
    labs(title="\nLinolenic Acid ~ arachidic + eicosenoic",
          x="Percentage Linolenic Acid",
          y="Predicted Percentage Linolenic Acid"
    )

ggplotly(plot2)

Fatty Acid Content of Italian Olive Oils

Model linear regression: %palmitoleic ~ palmitic + linoleic

lm<-lm(palmitoleic ~ palmitic + linoleic, olive)
summary(lm)
## 
## Call:
## lm(formula = palmitoleic ~ palmitic + linoleic, data = olive)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.81194 -0.16714 -0.01239  0.14850  1.10347 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.049903   0.078348  -26.16   <2e-16 ***
## palmitic     0.217086   0.007063   30.73   <2e-16 ***
## linoleic     0.064956   0.004904   13.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2526 on 569 degrees of freedom
## Multiple R-squared:  0.7693, Adjusted R-squared:  0.7685 
## F-statistic: 948.9 on 2 and 569 DF,  p-value: < 2.2e-16
# create predicted column

model <- predict(lm,se=TRUE)
olive <-olive %>%
  mutate(predicted = model$fit) %>%
  mutate(upperSe= predicted + 1.96*model$se.fit) %>%
  mutate(lowerSe= predicted - 1.96*model$se.fit)

Plot 3:

plot3<-olive %>%
  ggplot(aes(y=predicted,
             x=palmitoleic, 
             color=area, 
             labelR=region,
             labelP1=palmitic,
             labelP2=linoleic
             )
        )+
    theme_classic(base_size = 12)+
    geom_point()+
  #plot regression line
  geom_smooth(aes(ymin=lowerSe,ymax=upperSe),
                color="black",
                se=TRUE,
                method = 'loess',
                formula = 'y ~ x',
                stat="smooth"
    )+
    
    scale_fill_distiller(palette = "Set2")+
  theme(
    plot.title = element_text( size=10, face="plain",lineheight =.8,hjust = .5,vjust = 1),
    axis.title.x = element_text(size=12, face="plain"),
    legend.title = element_blank()
)+
    labs(title="\nPalmitoleic ~ palmitic + linoleic",
          x="Percentage Palmitoleic Acid",
          y="Predicted Percentage Palmitoleic Acid"
    )

ggplotly(plot3)

Fatty Acid Content of Italian Olive Oils