# eval: Whether to evaluate the code and include its results
# echo: Whether to display code along with its results
# cache: Whether to cache results for future renders
# Attach every package used by the wine-quality section, installing any that
# are not yet available on this machine.
packages <- c("corrplot", "ggpubr", "plotly", "tidyverse")
for (p in packages) {
  # require() returns FALSE instead of erroring when a package is missing,
  # which is what lets the loop fall through to install.packages().
  if (!require(p, character.only = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}
In this hands-on exercise, the Wine Quality Data Set of UCI Machine Learning Repository will be used. The data set consists of 13 variables and 6497 observations. For the purpose of this exercise, we have combined the red wine and white wine data into one data file. It is called wine_quality and is in csv file format.
Import the data into R by using read_csv() of readr package
# Load the combined wine data with readr. The header text is kept as written
# in the file, which is why later code refers to columns such as
# `fixed acidity` with backticks.
wine <- read_csv(file = "data/wine_quality.csv")
# Load the same file with the base R reader; per the original note, this
# replaces the spaces in the header names with dots.
wine1 <- read.csv(file = "data/wine_quality.csv")
Multiple histograms are plotted to reveal the distribution of the selected variables in the wine quality data set. Next, the ggarrange() function of the ggpubr package is used to group these histograms together.
# A single histogram layer (20 bins, black outline, light-blue fill) is shared
# by every plot below so that all panels are styled identically.
hist_layer <- geom_histogram(bins = 20, color = "black", fill = "light blue")

# One histogram per variable of the wine data.
fa <- ggplot(wine, aes(x = `fixed acidity`)) + hist_layer
va <- ggplot(wine, aes(x = `volatile acidity`)) + hist_layer
ca <- ggplot(wine, aes(x = `citric acid`)) + hist_layer
rs <- ggplot(wine, aes(x = `residual sugar`)) + hist_layer
ch <- ggplot(wine, aes(x = `chlorides`)) + hist_layer
fSO2 <- ggplot(wine, aes(x = `free sulfur dioxide`)) + hist_layer
tSO2 <- ggplot(wine, aes(x = `total sulfur dioxide`)) + hist_layer
density <- ggplot(wine, aes(x = density)) + hist_layer
pH <- ggplot(wine, aes(x = pH)) + hist_layer
sulphates <- ggplot(wine, aes(x = sulphates)) + hist_layer
alcohol <- ggplot(wine, aes(x = alcohol)) + hist_layer

# Arrange the eleven histograms on a grid of 4 columns by 3 rows.
ggarrange(
  fa, va, ca, rs, ch, fSO2, tSO2, density, pH, sulphates, alcohol,
  ncol = 4,
  nrow = 3
)
# Not a trellis (faceted) plot: each histogram is drawn one by one and then arranged together.
# Pairwise correlations between columns 2 to 12 of the wine data.
wine.cor <- cor(wine[, 2:12])

# Corrgram with corrplot's default settings.
corrplot(wine.cor)

# Ellipse glyphs, keeping only the lower triangle.
corrplot(wine.cor, type = "lower", method = "ellipse")

# As above, but without the diagonal and with black text labels.
corrplot(
  wine.cor,
  type = "lower",
  method = "ellipse",
  tl.col = "black",
  diag = FALSE
)

# Mixed layout: ellipses in the lower half, correlation values in the upper
# half, labels on the left and top.
corrplot.mixed(
  wine.cor,
  upper = "number",
  lower = "ellipse",
  tl.col = "black",
  tl.pos = "lt"
)
# Significance test for every pairwise correlation. cor.mtest() expects the
# raw observations (rows = samples, columns = variables), not an already
# computed correlation matrix, so it receives the same columns that were used
# to build wine.cor.
wine.sig <- cor.mtest(wine[, 2:12], conf.level = 0.95)

# Corrgram of correlation values (lower triangle, no diagonal) combined with
# the p-values from the test above.
corrplot(
  wine.cor,
  method = "number",
  type = "lower",
  diag = FALSE,
  tl.col = "black",
  tl.srt = 45,
  p.mat = wine.sig$p,
  # Set the significance level at 0.1: correlations whose p-value is larger
  # are treated as insignificant.
  sig.level = 0.1
)
#### 2.5 Combining corrgram with clustering
# Corrgram with variables reordered by hierarchical clustering ("ward.D"
# agglomeration) and rectangles drawn around 3 clusters.
corrplot(
  wine.cor,
  order = "hclust",
  hclust.method = "ward.D",
  addrect = 3,
  method = "ellipse",
  tl.col = "black",
  tl.pos = "lt"
)
Build a selection control in the interface to provide users with choices in the Shiny app.
# Attach every package used by the ternary-plot section, installing any that
# are not yet available on this machine.
packages <- c("ggtern", "plotly", "readr", "dplyr", "tidyr")
for (p in packages) {
  # require() returns FALSE instead of erroring when a package is missing,
  # which is what lets the loop fall through to install.packages().
  if (!require(p, character.only = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}
The code chunk below will then be used to install ggplot2 version 3.2.1.
# Attach devtools, which supplies install_version() for the optional step
# below. require() only warns and returns FALSE if devtools is not installed,
# so the script keeps running either way.
require(devtools)
# Optional and currently disabled: pin ggplot2 to version 3.2.1, either from a
# CRAN mirror via install_version() or from a direct archive URL.
# install_version("ggplot2", version = "3.2.1", repos = "http://cran.us.r-project.org")
# url = "https://cran.r-project.org/bin/windows/contrib/r-devel-gcc8/ggplot2_3.2.1.zip"
# install.packages(url, repos=NULL, type="source")
Then, the code chunk below will be used to launch ggplot2 package in RStudio.
# Attach ggplot2 explicitly, then read the tidy population table and print a
# compact overview of its columns.
library("ggplot2")
pop_data <- read_csv(file = "data/respopagsex2000to2018_tidy.csv")
glimpse(pop_data)
## Observations: 108,126
## Variables: 5
## $ PA <chr> "Ang Mo Kio", "Ang Mo Kio", "Ang Mo Kio", "Ang Mo Kio", ...
## $ SZ <chr> "Ang Mo Kio Town Centre", "Ang Mo Kio Town Centre", "Ang...
## $ AG <chr> "AGE0-4", "AGE0-4", "AGE0-4", "AGE0-4", "AGE0-4", "AGE0-...
## $ Year <dbl> 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2011, 20...
## $ Population <dbl> 290, 270, 260, 250, 260, 250, 200, 180, 290, 290, 270, 3...
# Reshape the population data so that each age group (AG) becomes its own
# column, then derive the YOUNG / ACTIVE / OLD totals used by the ternary plot
# and keep only the non-empty 2018 rows.
# NOTE(review): the rowSums() calls select columns by position, so they depend
# on the column order produced by spread() — confirm which age bands fall in
# 4:8, 9:16 and 17:21 before changing anything upstream.
agpop_mutated <- pop_data %>%
mutate(Year = as.character(Year)) %>%
# One column per AG value, filled with the matching Population.
spread(AG, Population) %>%
mutate(YOUNG = rowSums(.[4:8])) %>%
mutate(ACTIVE = rowSums(.[9:16])) %>%
mutate(OLD = rowSums(.[17:21])) %>%
# Presumably columns 22:24 are the YOUNG, ACTIVE and OLD columns appended
# above; that holds only if spread() yields exactly 21 columns — TODO confirm.
mutate(TOTAL = rowSums(.[22:24])) %>%
# Year is a character column at this point; comparing it with the number 2018
# relies on R's implicit coercion.
filter(Year == 2018) %>%
filter(TOTAL > 0)
# Static ternary plot: one point per row of agpop_mutated, positioned by its
# YOUNG, ACTIVE and OLD values.
# NOTE(review): the original author recorded the message
# "Error: $ operator is invalid for atomic vectors" at this step; presumably a
# ggtern / ggplot2 version mismatch — confirm before relying on this chunk.
ggtern(
  data = agpop_mutated,
  aes(x = YOUNG, y = ACTIVE, z = OLD)
) +
  geom_point()
# Attach every package used by the parallel-coordinates section, installing
# any that are not yet available on this machine.
packages <- c("GGally", "plotly", "parcoords", "tidyverse")
for (p in packages) {
  # require() returns FALSE instead of erroring when a package is missing,
  # which is what lets the loop fall through to install.packages().
  if (!require(p, character.only = TRUE)) {
    install.packages(p)
  }
  # Attach explicitly: a package that was just installed has not been loaded
  # by the failed require() call above.
  library(p, character.only = TRUE)
}
# Read the 2018 World Happiness data and print a compact overview of its
# columns.
wh <- read_csv(file = "data/WHData-2018.csv")
glimpse(wh)
## Observations: 156
## Variables: 12
## $ Country <chr> "Albania", "Bosnia and Herzegovina",...
## $ Region <chr> "Central and Eastern Europe", "Centr...
## $ `Happiness score` <dbl> 4.586, 5.129, 4.933, 5.321, 6.711, 5...
## $ `Whisker-high` <dbl> 4.695, 5.224, 5.022, 5.398, 6.783, 5...
## $ `Whisker-low` <dbl> 4.477, 5.035, 4.844, 5.244, 6.639, 5...
## $ Dystopia <dbl> 1.462, 1.883, 1.219, 1.769, 2.494, 1...
## $ `GDP per capita` <dbl> 0.916, 0.915, 1.054, 1.115, 1.233, 1...
## $ `Social support` <dbl> 0.817, 1.078, 1.515, 1.161, 1.489, 1...
## $ `Healthy life expectancy` <dbl> 0.790, 0.758, 0.712, 0.737, 0.854, 0...
## $ `Freedom to make life choices` <dbl> 0.419, 0.280, 0.359, 0.380, 0.543, 0...
## $ Generosity <dbl> 0.149, 0.216, 0.064, 0.120, 0.064, 0...
## $ `Perceptions of corruption` <dbl> 0.032, 0.000, 0.009, 0.039, 0.034, 0...
4.3.1 Working with ggparcoord() of GGally package
# Static parallel-coordinates plot of columns 7 to 12 of `wh`, grouped by
# column 2 and split into one panel per Region.
ggparcoord(
  wh,
  columns = 7:12,
  groupColumn = 2,
  # "uniminmax" rescales each variable so its minimum is 0 and its maximum 1.
  scale = "uniminmax",
  boxplot = TRUE,
  title = "Parallel Coord. Plot of World Happiness Attributes"
) +
  facet_wrap(~Region)
4.3.2 Plotting Interactive Parallel Coordinates
# Interactive parallel-coordinates widget for columns 7 to 12 of `wh`, with
# reorderable axes and "1D-axes" brushing.
parcoords(
  wh[, 7:12],
  reorderable = TRUE,
  brushMode = "1D-axes"
)
# Attach every package used by the heatmap section, installing any that are
# not yet available on this machine.
packages <- c("seriation", "dendextend", "heatmaply", "tidyverse")
for (p in packages) {
  # require() returns FALSE instead of erroring when a package is missing,
  # which is what lets the loop fall through to install.packages().
  if (!require(p, character.only = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}
# Build the numeric matrix consumed by the heatmaps from column 3 and columns
# 7 to 12 of `wh`.
wh1 <- select(wh, c(3, 7:12))
wh_matrix <- data.matrix(wh1)
# Label the rows with country names on the matrix itself. Assigning row names
# to the tibble `wh` is deprecated and raises a warning, and does not
# guarantee that the names reach the matrix.
rownames(wh_matrix) <- wh$Country
5.3.1 Static
# Static base-R heatmap of the happiness matrix with values scaled per column;
# the cex arguments shrink the row and column label text.
wh_heatmap <- heatmap(
  wh_matrix,
  cexCol = 0.8,
  cexRow = 0.6,
  scale = "column"
)
6.2.2 Interactive
# Interactive heatmap of the same matrix, rendered 800 wide by 1000 high.
heatmaply(
  wh_matrix,
  height = 1000,
  width = 800
)