(audio included under ASCAP educational license)
The olive data set of the Data Science Labs Package describes the percentage composition of fatty acids in olive oil from a variety of regions and areas of Italy collected from J. Zupan, et al., 1994. We will look for correlations between the components and their origins and plot what looks interesting.
Background from The Olive Oil Source:
The fatty acid composition of olive oil varies widely depending on the cultivar, maturity of the fruit, altitude, climate, and several other factors.
The major fatty acids in olive oil triacylglycerols are:
The olive dataset also includes arachidic, eicosenoic, and palmitoleic acids.
library(dslabs)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(RColorBrewer)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
data(olive)
olive<-as.tibble(olive)
## Warning: `as.tibble()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.
# examine its structure:
str(olive)
## Classes 'tbl_df', 'tbl' and 'data.frame': 572 obs. of 10 variables:
## $ region : Factor w/ 3 levels "Northern Italy",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ area : Factor w/ 9 levels "Calabria","Coast-Sardinia",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ palmitic : num 10.75 10.88 9.11 9.66 10.51 ...
## $ palmitoleic: num 0.75 0.73 0.54 0.57 0.67 0.49 0.66 0.61 0.6 0.55 ...
## $ stearic : num 2.26 2.24 2.46 2.4 2.59 2.68 2.64 2.35 2.39 2.13 ...
## $ oleic : num 78.2 77.1 81.1 79.5 77.7 ...
## $ linoleic : num 6.72 7.81 5.49 6.19 6.72 6.78 6.18 7.34 7.09 6.33 ...
## $ linolenic : num 0.36 0.31 0.31 0.5 0.5 0.51 0.49 0.39 0.46 0.26 ...
## $ arachidic : num 0.6 0.61 0.63 0.78 0.8 0.7 0.56 0.64 0.83 0.52 ...
## $ eicosenoic : num 0.29 0.29 0.29 0.35 0.46 0.44 0.29 0.35 0.33 0.3 ...
# convert factors to characters
olive$region<-as.character(olive$region)
olive$area<-as.character(olive$area)
# check the summary
summary(olive)
## region area palmitic palmitoleic
## Length:572 Length:572 Min. : 6.10 Min. :0.1500
## Class :character Class :character 1st Qu.:10.95 1st Qu.:0.8775
## Mode :character Mode :character Median :12.01 Median :1.1000
## Mean :12.32 Mean :1.2609
## 3rd Qu.:13.60 3rd Qu.:1.6925
## Max. :17.53 Max. :2.8000
## stearic oleic linoleic linolenic
## Min. :1.520 Min. :63.00 Min. : 4.480 Min. :0.0000
## 1st Qu.:2.050 1st Qu.:70.00 1st Qu.: 7.707 1st Qu.:0.2600
## Median :2.230 Median :73.03 Median :10.300 Median :0.3300
## Mean :2.289 Mean :73.12 Mean : 9.805 Mean :0.3189
## 3rd Qu.:2.490 3rd Qu.:76.80 3rd Qu.:11.807 3rd Qu.:0.4025
## Max. :3.750 Max. :84.10 Max. :14.700 Max. :0.7400
## arachidic eicosenoic
## Min. :0.000 Min. :0.0100
## 1st Qu.:0.500 1st Qu.:0.0200
## Median :0.610 Median :0.1700
## Mean :0.581 Mean :0.1628
## 3rd Qu.:0.700 3rd Qu.:0.2800
## Max. :1.050 Max. :0.5800
totalPcts<-olive %>%
select(-region,-area) %>%
rowSums()
summary(totalPcts)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 98.11 99.70 99.96 99.85 100.01 101.02
length(totalPcts)
## [1] 572
olive %>%
select(region,area) %>%
group_by(region, area) %>%
summarise(count=n())
## # A tibble: 9 x 3
## # Groups: region [3]
## region area count
## <chr> <chr> <int>
## 1 Northern Italy East-Liguria 50
## 2 Northern Italy Umbria 51
## 3 Northern Italy West-Liguria 50
## 4 Sardinia Coast-Sardinia 33
## 5 Sardinia Inland-Sardinia 65
## 6 Southern Italy Calabria 56
## 7 Southern Italy North-Apulia 25
## 8 Southern Italy Sicily 36
## 9 Southern Italy South-Apulia 206
options(width = 300)
corFA<-as.data.frame( cor(olive[,3:10]))
print(as_tibble(corFA),digits = 2, width=300)
## # A tibble: 8 x 8
## palmitic palmitoleic stearic oleic linoleic linolenic arachidic eicosenoic
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.836 -0.170 -0.837 0.461 0.319 0.228 0.502
## 2 0.836 1 -0.222 -0.852 0.622 0.0931 0.0855 0.416
## 3 -0.170 -0.222 1 0.114 -0.198 0.0189 -0.0410 0.140
## 4 -0.837 -0.852 0.114 1 -0.850 -0.218 -0.320 -0.424
## 5 0.461 0.622 -0.198 -0.850 1 -0.0574 0.211 0.0890
## 6 0.319 0.0931 0.0189 -0.218 -0.0574 1 0.620 0.578
## 7 0.228 0.0855 -0.0410 -0.320 0.211 0.620 1 0.329
## 8 0.502 0.416 0.140 -0.424 0.0890 0.578 0.329 1
mask<-upper.tri(corFA,diag = FALSE)
f<-function(a) if (a) 1 else NA
# eliminate lower triangle
corFA<-as.matrix(corFA*sapply(mask, f))
print(corFA)
## palmitic palmitoleic stearic oleic linoleic linolenic arachidic eicosenoic
## palmitic NA 0.835605 -0.1703918 -0.8373354 0.4606845 0.31932669 0.22829912 0.50195179
## palmitoleic NA NA -0.2221854 -0.8524384 0.6216267 0.09311163 0.08548117 0.41635048
## stearic NA NA NA 0.1135987 -0.1978169 0.01891719 -0.04097892 0.14037748
## oleic NA NA NA NA -0.8503184 -0.21817123 -0.31996234 -0.42414586
## linoleic NA NA NA NA NA -0.05743858 0.21097260 0.08904499
## linolenic NA NA NA NA NA NA 0.62023577 0.57831851
## arachidic NA NA NA NA NA NA NA 0.32866349
## eicosenoic NA NA NA NA NA NA NA NA
summary(abs(as.numeric(corFA)))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.01892 0.13368 0.27381 0.35192 0.52104 0.85244 36
# save third quartile as threshold cut off
q3<-summary(abs(as.numeric(corFA)))[5]
f<-function(a) if (is.na(a)) NA else if(abs(a)>=q3) 1 else NA
sig<-(corFA*sapply(corFA, f))
print(sig,digits = 2)
## palmitic palmitoleic stearic oleic linoleic linolenic arachidic eicosenoic
## palmitic NA 0.84 NA -0.84 NA NA NA NA
## palmitoleic NA NA NA -0.85 0.62 NA NA NA
## stearic NA NA NA NA NA NA NA NA
## oleic NA NA NA NA -0.85 NA NA NA
## linoleic NA NA NA NA NA NA NA NA
## linolenic NA NA NA NA NA NA 0.62 0.58
## arachidic NA NA NA NA NA NA NA NA
## eicosenoic NA NA NA NA NA NA NA NA
fa2<-as.character(sapply(rownames(corFA),FUN=function(b) paste(b,rownames(corFA),sep = "-")))
a<-unlist(mapply(FUN=function(a,b) if (!is.na(a)) paste(b,a ),as.numeric(sig),fa2))
for ( i in a) print(i)
## [1] "palmitoleic-palmitic 0.835604968040802"
## [1] "oleic-palmitic -0.837335352032493"
## [1] "oleic-palmitoleic -0.852438351564888"
## [1] "linoleic-palmitoleic 0.621626663681314"
## [1] "linoleic-oleic -0.850318366225752"
## [1] "arachidic-linolenic 0.620235772288214"
## [1] "eicosenoic-linolenic 0.578318507487605"
using color filled by area and labeled by region abbreviation.
This did not show up on plots (too crowded to fit?)
olive <- olive %>%
mutate(regionAbbr=
if (grepl("North",region)[1]) "NI" # northern Italy
else if (grepl("South",region)[1]) "SI" # southern Italy
else "Sar" # Sardinia
)
lm<-lm(oleic ~ palmitic + linoleic + palmitoleic, olive)
summary(lm)
##
## Call:
## lm(formula = oleic ~ palmitic + linoleic + palmitoleic, data = olive)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.42546 -0.35781 0.02799 0.43925 1.85301
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 98.64750 0.29112 338.86 < 2e-16 ***
## palmitic -1.26566 0.02884 -43.89 < 2e-16 ***
## linoleic -0.95697 0.01404 -68.14 < 2e-16 ***
## palmitoleic -0.44179 0.10495 -4.21 2.97e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6322 on 568 degrees of freedom
## Multiple R-squared: 0.9759, Adjusted R-squared: 0.9757
## F-statistic: 7652 on 3 and 568 DF, p-value: < 2.2e-16
# create predicted column
model <- predict(lm,se=TRUE)
olive <-olive %>%
mutate(predicted = model$fit) %>%
# set the residual boundary lines at the 96th percentile on either side of the regression line.
mutate(upperSe= predicted + 1.96*model$se.fit) %>%
mutate(lowerSe= predicted - 1.96*model$se.fit)
# lm<-lm(oleic ~ palmitic + linoleic + palmitoleic, olive)
plot1<-olive %>%
ggplot(aes(y=predicted,
x=oleic,
color=area,
labelR=region,
labelP1=palmitic,
labelP2=palmitoleic,
labelL=linoleic
)
)+
theme_classic(base_size = 12)+
geom_point()+
# plot regression line with standard error:
geom_smooth(aes(ymin=lowerSe,ymax=upperSe),
color="black",
se=TRUE,
method = 'loess',
formula = 'y ~ x',
stat="smooth"
)+
scale_fill_distiller(palette = "Set2")+
theme(
plot.title = element_text( size=10, face="plain",lineheight =.8,hjust = .5 ,vjust = 1),
axis.title.x = element_text(size=12, face="plain"),
legend.title = element_blank()
)+
labs(title="\nOleic Acid ~ Palmitic + Linoleic + Palmitoleic",
x="Percentage Oleic Acid",
y="Predicted Percentage Oleic Acid"
)
ggplotly(plot1)
Fatty Acid Content of Italian Olive Oils
lm<-lm(linolenic ~ arachidic + eicosenoic, olive)
summary(lm)
##
## Call:
## lm(formula = linolenic ~ arachidic + eicosenoic, data = olive)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.16907 -0.06204 -0.01048 0.04701 0.40868
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.09100 0.01044 8.718 <2e-16 ***
## arachidic 0.28389 0.01769 16.052 <2e-16 ***
## eicosenoic 0.38659 0.02767 13.974 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08793 on 569 degrees of freedom
## Multiple R-squared: 0.5419, Adjusted R-squared: 0.5403
## F-statistic: 336.5 on 2 and 569 DF, p-value: < 2.2e-16
# create predicted column
model <- predict(lm,se=TRUE)
olive <-olive %>%
mutate(predicted = model$fit) %>%
mutate(upperSe= predicted + 1.96*model$se.fit) %>%
mutate(lowerSe= predicted - 1.96*model$se.fit)
plot2<-olive %>%
ggplot(aes(y=predicted,
x=linolenic,
color=area,
labelR=region,
labelP1=arachidic,
labelP2=eicosenoic,
)
)+
theme_classic(base_size = 12)+
geom_point()+
#plot regression line
geom_smooth(aes(ymin=lowerSe,ymax=upperSe),
color="black",
se=TRUE,
method = 'loess',
formula = 'y ~ x',
stat="smooth"
)+
scale_fill_distiller(palette = "Set2")+
theme(
plot.title = element_text( size=10, face="plain",lineheight =.8,hjust = .5,vjust = 1),
axis.title.x = element_text(size=12, face="plain"),
legend.title = element_blank()
)+
labs(title="\nLinolenic Acid ~ arachidic + eicosenoic",
x="Percentage Linolenic Acid",
y="Predicted Percentage Linolenic Acid"
)
ggplotly(plot2)
Fatty Acid Content of Italian Olive Oils
lm<-lm(palmitoleic ~ palmitic + linoleic, olive)
summary(lm)
##
## Call:
## lm(formula = palmitoleic ~ palmitic + linoleic, data = olive)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.81194 -0.16714 -0.01239 0.14850 1.10347
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.049903 0.078348 -26.16 <2e-16 ***
## palmitic 0.217086 0.007063 30.73 <2e-16 ***
## linoleic 0.064956 0.004904 13.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2526 on 569 degrees of freedom
## Multiple R-squared: 0.7693, Adjusted R-squared: 0.7685
## F-statistic: 948.9 on 2 and 569 DF, p-value: < 2.2e-16
# create predicted column
model <- predict(lm,se=TRUE)
olive <-olive %>%
mutate(predicted = model$fit) %>%
mutate(upperSe= predicted + 1.96*model$se.fit) %>%
mutate(lowerSe= predicted - 1.96*model$se.fit)
plot3<-olive %>%
ggplot(aes(y=predicted,
x=palmitoleic,
color=area,
labelR=region,
labelP1=palmitic,
labelP2=linoleic
)
)+
theme_classic(base_size = 12)+
geom_point()+
#plot regression line
geom_smooth(aes(ymin=lowerSe,ymax=upperSe),
color="black",
se=TRUE,
method = 'loess',
formula = 'y ~ x',
stat="smooth"
)+
scale_fill_distiller(palette = "Set2")+
theme(
plot.title = element_text( size=10, face="plain",lineheight =.8,hjust = .5,vjust = 1),
axis.title.x = element_text(size=12, face="plain"),
legend.title = element_blank()
)+
labs(title="\nPalmitoleic ~ palmitic + linoleic",
x="Percentage Palmitoleic Acid",
y="Predicted Percentage Palmitoleic Acid"
)
ggplotly(plot3)
Fatty Acid Content of Italian Olive Oils