DS Labs Datasets

# One-time setup (data science labs package): install.packages("dslabs")
library(dslabs)

# Show the datasets that ship with dslabs.
data(package = "dslabs")

# Show the files stored in the package's installed "script" directory.
list.files(path = system.file("script", package = "dslabs"))
##  [1] "make-admissions.R"                   
##  [2] "make-brca.R"                         
##  [3] "make-brexit_polls.R"                 
##  [4] "make-death_prob.R"                   
##  [5] "make-divorce_margarine.R"            
##  [6] "make-gapminder-rdas.R"               
##  [7] "make-greenhouse_gases.R"             
##  [8] "make-historic_co2.R"                 
##  [9] "make-mnist_27.R"                     
## [10] "make-movielens.R"                    
## [11] "make-murders-rda.R"                  
## [12] "make-na_example-rda.R"               
## [13] "make-nyc_regents_scores.R"           
## [14] "make-olive.R"                        
## [15] "make-outlier_example.R"              
## [16] "make-polls_2008.R"                   
## [17] "make-polls_us_election_2016.R"       
## [18] "make-reported_heights-rda.R"         
## [19] "make-research_funding_rates.R"       
## [20] "make-stars.R"                        
## [21] "make-temp_carbon.R"                  
## [22] "make-tissue-gene-expression.R"       
## [23] "make-trump_tweets.R"                 
## [24] "make-weekly_us_contagious_diseases.R"
## [25] "save-gapminder-example-csv.R"

Loading the Italian olive dataset and the required libraries:

# Load the `olive` dataset into the workspace.
data("olive")
# Attach the tidyverse packages; the attach/conflict report is echoed below.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggthemes)
library(ggrepel)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# view(olive)  # uncomment to inspect the data interactively
# Export the data as CSV, writing missing values as empty strings.
# The extension is ".csv" (it was previously misspelled ".cvs").
write_csv(olive, "olive.csv", na = "")

Italian Olive Dataset

# Print the structure of `olive`: dimensions, column types and first values.
str(olive)
## 'data.frame':    572 obs. of  10 variables:
##  $ region     : Factor w/ 3 levels "Northern Italy",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ area       : Factor w/ 9 levels "Calabria","Coast-Sardinia",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ palmitic   : num  10.75 10.88 9.11 9.66 10.51 ...
##  $ palmitoleic: num  0.75 0.73 0.54 0.57 0.67 0.49 0.66 0.61 0.6 0.55 ...
##  $ stearic    : num  2.26 2.24 2.46 2.4 2.59 2.68 2.64 2.35 2.39 2.13 ...
##  $ oleic      : num  78.2 77.1 81.1 79.5 77.7 ...
##  $ linoleic   : num  6.72 7.81 5.49 6.19 6.72 6.78 6.18 7.34 7.09 6.33 ...
##  $ linolenic  : num  0.36 0.31 0.31 0.5 0.5 0.51 0.49 0.39 0.46 0.26 ...
##  $ arachidic  : num  0.6 0.61 0.63 0.78 0.8 0.7 0.56 0.64 0.83 0.52 ...
##  $ eicosenoic : num  0.29 0.29 0.29 0.35 0.46 0.44 0.29 0.35 0.33 0.3 ...
# Print the column names only.
names(olive)
##  [1] "region"      "area"        "palmitic"    "palmitoleic" "stearic"    
##  [6] "oleic"       "linoleic"    "linolenic"   "arachidic"   "eicosenoic"

Distribution of Fatty Acids across Regions

# Drop the `area` column. any_of() keeps this chunk re-runnable: a plain
# `-area` would error once the column has already been removed.
olive <- select(olive, -any_of("area"))

# Reshape to long form (one row per sample and fatty acid), then draw one
# box-plot panel per fatty acid with one box per region.
# pivot_longer() replaces the superseded gather().
olive %>%
  pivot_longer(-region, names_to = "fatty_acid", values_to = "percentage") %>%
  ggplot(aes(region, percentage, fill = region)) +
  geom_boxplot() +
  # Free scales: each fatty acid panel gets its own axis range.
  facet_wrap(~fatty_acid, scales = "free", ncol = 4) +
  # The x-axis tick labels are hidden; the fill legend identifies the regions.
  theme(axis.text.x = element_blank(), legend.position = "bottom") +
  labs(
    title = "Fatty Acid Distribution between each Region",
    x = "Regions",
    y = "Percentages"
  )

Correlation Between Fatty Acids

library(corrplot)
## corrplot 0.90 loaded
# Keep only the eight fatty-acid columns that enter the correlation matrix.
corr_olive <- olive %>%
  select(
    "arachidic", "eicosenoic", "linoleic", "linolenic",
    "oleic", "palmitic", "palmitoleic", "stearic"
  )

# Pairwise correlations, drawn as ellipses in the lower triangle and as
# numeric coefficients in the upper triangle.
corrplot.mixed(
  cor(corr_olive),
  upper = "number",
  lower = "ellipse",
  diag = "l",
  tl.pos = "lt",
  tl.col = "black"
)

Linear regression models

# Model 1: linolenic explained by arachidic and eicosenoic.
# Each fit gets its own name so that the function `lm` is not shadowed by a
# variable and the first model is not overwritten by the second.
linolenic_fit <- lm(linolenic ~ arachidic + eicosenoic, data = olive)
summary(linolenic_fit)
## 
## Call:
## lm(formula = linolenic ~ arachidic + eicosenoic, data = olive)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.16907 -0.06204 -0.01048  0.04701  0.40868 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.09100    0.01044   8.718   <2e-16 ***
## arachidic    0.28389    0.01769  16.052   <2e-16 ***
## eicosenoic   0.38659    0.02767  13.974   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08793 on 569 degrees of freedom
## Multiple R-squared:  0.5419, Adjusted R-squared:  0.5403 
## F-statistic: 336.5 on 2 and 569 DF,  p-value: < 2.2e-16
# Model 2: palmitoleic explained by palmitic and linoleic.
palmitoleic_fit <- lm(palmitoleic ~ palmitic + linoleic, data = olive)
summary(palmitoleic_fit)
## 
## Call:
## lm(formula = palmitoleic ~ palmitic + linoleic, data = olive)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.81194 -0.16714 -0.01239  0.14850  1.10347 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.049903   0.078348  -26.16   <2e-16 ***
## palmitic     0.217086   0.007063   30.73   <2e-16 ***
## linoleic     0.064956   0.004904   13.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2526 on 569 degrees of freedom
## Multiple R-squared:  0.7693, Adjusted R-squared:  0.7685 
## F-statistic: 948.9 on 2 and 569 DF,  p-value: < 2.2e-16
# Fitted values and their standard errors from the palmitoleic model.
# `se.fit` is spelled out instead of relying on partial matching of `se`.
model <- predict(palmitoleic_fit, se.fit = TRUE)
olive <- olive %>%
  mutate(
    predicted = model$fit,
    # Approximate 95% confidence band for the fitted mean: fit +/- 1.96 * SE.
    upperSe = predicted + 1.96 * model$se.fit,
    lowerSe = predicted - 1.96 * model$se.fit
  )

Plotting %linolenic vs %(arachidic, eicosenoic)

# The `predicted`, `upperSe` and `lowerSe` columns stored on `olive` come from
# the palmitoleic model, so they must not be plotted against linolenic.
# Fit the linolenic model here and override those columns for this plot only;
# `olive` itself is left unchanged.
linolenic_model <- lm(linolenic ~ arachidic + eicosenoic, data = olive)
linolenic_pred <- predict(linolenic_model, se.fit = TRUE)

olive1 <- olive %>%
  mutate(
    predicted = linolenic_pred$fit,
    # Approximate 95% confidence band for the fitted mean: fit +/- 1.96 * SE.
    upperSe = predicted + 1.96 * linolenic_pred$se.fit,
    lowerSe = predicted - 1.96 * linolenic_pred$se.fit
  ) %>%
  # Observed linolenic on x, model prediction on y. The label* aesthetics
  # carry the region and predictor values along with each point.
  ggplot(aes(y = predicted,
             x = linolenic,
             color = region,
             labelR = region,
             labelP1 = arachidic,
             labelP2 = eicosenoic)) +
  theme_classic(base_size = 12) +
  geom_point() +
  geom_smooth(aes(ymin = lowerSe, ymax = upperSe),
              color = "black",
              se = TRUE,
              method = "loess",
              formula = "y ~ x",
              stat = "smooth") +
  scale_fill_distiller(palette = "Set2") +
  theme(plot.title = element_text(size = 13, face = "plain", lineheight = .8,
                                  hjust = .5, vjust = 1),
        axis.title.x = element_text(size = 12, face = "plain"),
        legend.title = element_blank()) +
  labs(title = "Linolenic Acid ~ arachidic + eicosenoic",
       x = "Percentage Linolenic Acid",
       y = "Predicted Percentage Linolenic Acid")

# Convert the static ggplot into an interactive plotly widget.
ggplotly(olive1)

Plotting %palmitoleic vs %(palmitic, linoleic)

# Observed palmitoleic (x) against the `predicted` column of `olive` (y),
# coloured by region. The label* aesthetics carry the region and the two
# predictor values along with each point.
olive2 <- ggplot(
  data = olive,
  mapping = aes(
    x = palmitoleic,
    y = predicted,
    color = region,
    labelR = region,
    labelP1 = palmitic,
    labelP2 = linoleic
  )
) +
  geom_point() +
  # Black loess smoother through the points.
  geom_smooth(
    mapping = aes(ymin = lowerSe, ymax = upperSe),
    stat = "smooth",
    method = "loess",
    formula = "y ~ x",
    se = TRUE,
    color = "black"
  ) +
  scale_fill_distiller(palette = "Set2") +
  labs(
    title = "Palmitoleic ~ palmitic + linoleic",
    x = "Percentage Palmitoleic Acid",
    y = "Predicted Percentage Palmitoleic Acid"
  ) +
  # The complete theme comes first so the tweaks below are applied on top.
  theme_classic(base_size = 12) +
  theme(
    plot.title = element_text(
      size = 13, face = "plain", lineheight = .8, hjust = .5, vjust = 1
    ),
    axis.title.x = element_text(size = 12, face = "plain"),
    legend.title = element_blank()
  )

# Convert the static ggplot into an interactive plotly widget.
ggplotly(olive2)

DSLabs - Italian Olives

I used the “Italian Olives” dataset from the dslabs datasets. This dataset describes the percentage composition of fatty acids in olive oil from a variety of regions and areas of Italy, collected from J. Zupan and J. Gasteiger, Neural Networks in Chemistry and Drug Design, 1994. The goal is to see the relationship between the fatty acid variables and their origins. First, I created multi-panel boxplots to observe the distribution of each fatty acid across the regions. Judging from the boxplots, most of the Italian olive oil samples are from Southern Italy. Then I made a heatmap to see the correlations between the fatty acids. From the correlation heatmap I picked %linolenic vs %(arachidic, eicosenoic) and %palmitoleic vs %(palmitic, linoleic). Fatty acid variables were chosen if they were positively correlated with a correlation coefficient above 0.5. Based on the information I gathered, I plotted the data on scatterplots to see the relationship between the chosen variables and their origins.