# The dslabs (data science labs) package supplies the data used below.
# Install it once with:
#   install.packages("dslabs")
library(dslabs)

# List the data sets that ship with dslabs.
data(package = "dslabs")

# List the files in the package's installed "script" directory.
dslabs_script_dir <- system.file("script", package = "dslabs")
list.files(dslabs_script_dir)
## [1] "make-admissions.R"
## [2] "make-brca.R"
## [3] "make-brexit_polls.R"
## [4] "make-death_prob.R"
## [5] "make-divorce_margarine.R"
## [6] "make-gapminder-rdas.R"
## [7] "make-greenhouse_gases.R"
## [8] "make-historic_co2.R"
## [9] "make-mnist_27.R"
## [10] "make-movielens.R"
## [11] "make-murders-rda.R"
## [12] "make-na_example-rda.R"
## [13] "make-nyc_regents_scores.R"
## [14] "make-olive.R"
## [15] "make-outlier_example.R"
## [16] "make-polls_2008.R"
## [17] "make-polls_us_election_2016.R"
## [18] "make-reported_heights-rda.R"
## [19] "make-research_funding_rates.R"
## [20] "make-stars.R"
## [21] "make-temp_carbon.R"
## [22] "make-tissue-gene-expression.R"
## [23] "make-trump_tweets.R"
## [24] "make-weekly_us_contagious_diseases.R"
## [25] "save-gapminder-example-csv.R"
# Load the `olive` data set into the workspace.
data("olive")
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggthemes)
library(ggrepel)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# view(olive)
# Export the data as CSV, writing missing values as empty fields.
# The file extension was previously misspelled as ".cvs".
write_csv(olive, "olive.csv", na = "")
# Print the structure (column types and leading values) of the data.
str(olive)
## 'data.frame': 572 obs. of 10 variables:
## $ region : Factor w/ 3 levels "Northern Italy",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ area : Factor w/ 9 levels "Calabria","Coast-Sardinia",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ palmitic : num 10.75 10.88 9.11 9.66 10.51 ...
## $ palmitoleic: num 0.75 0.73 0.54 0.57 0.67 0.49 0.66 0.61 0.6 0.55 ...
## $ stearic : num 2.26 2.24 2.46 2.4 2.59 2.68 2.64 2.35 2.39 2.13 ...
## $ oleic : num 78.2 77.1 81.1 79.5 77.7 ...
## $ linoleic : num 6.72 7.81 5.49 6.19 6.72 6.78 6.18 7.34 7.09 6.33 ...
## $ linolenic : num 0.36 0.31 0.31 0.5 0.5 0.51 0.49 0.39 0.46 0.26 ...
## $ arachidic : num 0.6 0.61 0.63 0.78 0.8 0.7 0.56 0.64 0.83 0.52 ...
## $ eicosenoic : num 0.29 0.29 0.29 0.35 0.46 0.44 0.29 0.35 0.33 0.3 ...
# Print the column names of `olive`.
names(olive)
## [1] "region" "area" "palmitic" "palmitoleic" "stearic"
## [6] "oleic" "linoleic" "linolenic" "arachidic" "eicosenoic"
# Drop the `area` column so that `region` is the only grouping variable left.
# any_of() makes this a no-op (instead of an error) if the step is re-run
# after `area` has already been removed.
olive <- select(olive, -any_of("area"))

# Reshape to long form (one row per sample and fatty acid) and draw one
# boxplot panel per fatty acid, comparing the regions within each panel.
# pivot_longer() replaces the superseded gather().
olive %>%
  pivot_longer(
    -region,
    names_to = "fatty_acid",
    values_to = "percentage"
  ) %>%
  ggplot(aes(region, percentage, fill = region)) +
  geom_boxplot() +
  # Free scales: each panel gets its own axis ranges.
  facet_wrap(~fatty_acid, scales = "free", ncol = 4) +
  # Regions are identified by the fill legend, so the x tick labels are hidden.
  theme(axis.text.x = element_blank(), legend.position = "bottom") +
  labs(
    title = "Fatty Acid Distribution between each Region",
    x = "Regions",
    y = "Percentages"
  )
library(corrplot)
## corrplot 0.90 loaded
# Fatty-acid columns, in the column order used for the correlation matrix.
fatty_acid_cols <- c(
  "arachidic", "eicosenoic", "linoleic", "linolenic",
  "oleic", "palmitic", "palmitoleic", "stearic"
)
corr_olive <- select(olive, all_of(fatty_acid_cols))

# Pairwise correlations between the fatty acids, drawn as a mixed plot:
# ellipses in the lower triangle, numeric coefficients in the upper triangle.
corr_matrix <- cor(corr_olive)
corrplot.mixed(
  corr_matrix,
  lower = "ellipse",
  upper = "number",
  tl.pos = "lt",
  diag = "l",
  tl.col = "black"
)
# Linear model: linolenic acid explained by arachidic and eicosenoic acid.
# Stored under its own name rather than `lm`, so it neither shadows
# stats::lm() nor gets lost when another model is assigned to `lm`.
linolenic_model <- lm(linolenic ~ arachidic + eicosenoic, data = olive)
summary(linolenic_model)
##
## Call:
## lm(formula = linolenic ~ arachidic + eicosenoic, data = olive)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.16907 -0.06204 -0.01048 0.04701 0.40868
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.09100 0.01044 8.718 <2e-16 ***
## arachidic 0.28389 0.01769 16.052 <2e-16 ***
## eicosenoic 0.38659 0.02767 13.974 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08793 on 569 degrees of freedom
## Multiple R-squared: 0.5419, Adjusted R-squared: 0.5403
## F-statistic: 336.5 on 2 and 569 DF, p-value: < 2.2e-16
# Linear model: palmitoleic acid explained by palmitic and linoleic acid.
# NOTE(review): the result is stored in a variable named `lm`, the same name
# as the fitting function; the name is kept because the later predict() call
# reads it.
lm <- lm(
  formula = palmitoleic ~ palmitic + linoleic,
  data = olive
)
summary(lm)
##
## Call:
## lm(formula = palmitoleic ~ palmitic + linoleic, data = olive)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.81194 -0.16714 -0.01239 0.14850 1.10347
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.049903 0.078348 -26.16 <2e-16 ***
## palmitic 0.217086 0.007063 30.73 <2e-16 ***
## linoleic 0.064956 0.004904 13.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2526 on 569 degrees of freedom
## Multiple R-squared: 0.7693, Adjusted R-squared: 0.7685
## F-statistic: 948.9 on 2 and 569 DF, p-value: < 2.2e-16
# Fitted values and their standard errors from the model currently stored in
# `lm`. The argument is spelled out as `se.fit` instead of relying on partial
# matching of `se`.
model <- predict(lm, se.fit = TRUE)

# Attach the fitted values and an approximate 95% confidence band for the
# fitted mean (fit +/- 1.96 standard errors) to the data.
olive <- olive %>%
  mutate(
    predicted = model$fit,
    upperSe = predicted + 1.96 * model$se.fit,
    lowerSe = predicted - 1.96 * model$se.fit
  )
# Predicted vs. observed linolenic acid for the model
#   linolenic ~ arachidic + eicosenoic.
# The `predicted`/`upperSe`/`lowerSe` columns already on `olive` come from
# whichever model `lm` held when they were computed, so this plot fits its own
# model under a distinct name and derives those columns locally. The global
# `olive` data frame is not modified.
linolenic_fit <- stats::lm(linolenic ~ arachidic + eicosenoic, data = olive)
linolenic_pred <- predict(linolenic_fit, se.fit = TRUE)

olive1 <- olive %>%
  mutate(
    predicted = linolenic_pred$fit,
    # fit +/- 1.96 standard errors: approximate 95% confidence band
    upperSe = predicted + 1.96 * linolenic_pred$se.fit,
    lowerSe = predicted - 1.96 * linolenic_pred$se.fit
  ) %>%
  ggplot(aes(
    y = predicted,
    x = linolenic,
    color = region,
    # Extra aesthetics; presumably there so ggplotly() can show them in the
    # hover tooltip -- TODO confirm.
    labelR = region,
    labelP1 = arachidic,
    labelP2 = eicosenoic
  )) +
  theme_classic(base_size = 12) +
  geom_point() +
  # NOTE(review): with stat = "smooth" the band is presumably recomputed from
  # the loess fit, so the mapped ymin/ymax may have no effect -- verify.
  geom_smooth(
    aes(ymin = lowerSe, ymax = upperSe),
    color = "black",
    se = TRUE,
    method = "loess",
    formula = "y ~ x",
    stat = "smooth"
  ) +
  # NOTE(review): no `fill` aesthetic is mapped in this plot, so this scale
  # presumably has no visible effect.
  scale_fill_distiller(palette = "Set2") +
  theme(
    plot.title = element_text(
      size = 13, face = "plain", lineheight = .8, hjust = .5, vjust = 1
    ),
    axis.title.x = element_text(size = 12, face = "plain"),
    legend.title = element_blank()
  ) +
  labs(
    title = "Linolenic Acid ~ arachidic + eicosenoic",
    x = "Percentage Linolenic Acid",
    y = "Predicted Percentage Linolenic Acid"
  )

# Convert the static ggplot into an interactive plotly widget.
ggplotly(olive1)
# Predicted vs. observed palmitoleic acid, coloured by region. The
# `predicted`, `lowerSe` and `upperSe` columns are read from `olive`.
olive2 <- ggplot(
  data = olive,
  mapping = aes(
    y = predicted,
    x = palmitoleic,
    color = region,
    labelR = region,
    labelP1 = palmitic,
    labelP2 = linoleic
  )
) +
  theme_classic(base_size = 12) +
  geom_point() +
  # Black loess smoother drawn over the points.
  geom_smooth(
    mapping = aes(ymin = lowerSe, ymax = upperSe),
    color = "black",
    se = TRUE,
    method = "loess",
    formula = "y ~ x",
    stat = "smooth"
  ) +
  scale_fill_distiller(palette = "Set2") +
  theme(
    plot.title = element_text(
      size = 13,
      face = "plain",
      lineheight = .8,
      hjust = .5,
      vjust = 1
    ),
    axis.title.x = element_text(size = 12, face = "plain"),
    legend.title = element_blank()
  ) +
  labs(
    title = "Palmitoleic ~ palmitic + linoleic",
    x = "Percentage Palmitoleic Acid",
    y = "Predicted Percentage Palmitoleic Acid"
  )

# Convert the static ggplot into an interactive plotly widget.
ggplotly(olive2)
DSLabs - Italian Olives
I used the “Italian Olives” dataset from the dslabs package. The dataset describes the percentage composition of fatty acids in olive oils from a variety of regions and areas of Italy, collected from J. Zupan and J. Gasteiger, Neural Networks in Chemistry and Drug Design, 1994. The goal is to examine the relationship between the fatty acid variables and the oils' regions of origin. First, I created a faceted boxplot to observe the distribution of each fatty acid across the regions. From the boxplots, most of the Italian olive oil samples appear to come from Southern Italy. Then I made a correlation heatmap to see the correlations between the fatty acids. From the heatmap I picked %linolenic vs. %(arachidic, eicosenoic) and %palmitoleic vs. %(palmitic, linoleic). These fatty acid variables were chosen because they were positively correlated, with correlation coefficients above 0.5. Based on this information, I plotted the data on scatterplots to see the relationship between the chosen variables and their regions of origin.