Visualising and Analysing Multidimensional Data

1. Load packages

# eval: Whether to evaluate the code and include its results
# echo: Whether to display code along with its results
# cache: Whether to cache results for future renders

# Packages used in the correlation-plot section.
packages <- c("corrplot", "ggpubr", "plotly", "tidyverse")

# Install any package that is not available yet, then attach it.
for (p in packages) {
  # requireNamespace() only checks availability; attaching is left to the
  # library() call below, which stops with an error if the install failed.
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}

2. Corrplot

2.1 Load data

In this hands-on exercise, the Wine Quality Data Set of UCI Machine Learning Repository will be used. The data set consists of 13 variables and 6497 observations. For the purpose of this exercise, we have combined the red wine and white wine data into one data file. It is called wine_quality and is in csv file format.

Import the data into R by using read_csv() of readr package

# Both readers load the same file.
wine_csv <- "data/wine_quality.csv"

# readr: header names are kept as written in the file, spaces included.
wine <- read_csv(file = wine_csv)

# Base R: spaces in the header names are replaced by dots.
wine1 <- read.csv(file = wine_csv)

2.2 Univariate EDA with Histogram

Multiple histograms are plotted to reveal the distribution of the selected variables in the wine quality data set. Next, the ggarrange() function of the ggpubr package is used to group these histograms together.

# All histograms share the same geom settings, so the layer is built in one
# place; a fresh layer is created for every plot.
hist_layer <- function() {
  geom_histogram(bins = 20, color = "black", fill = "light blue")
}

# One histogram per wine attribute.
fa <- ggplot(wine, aes(x = `fixed acidity`)) + hist_layer()
va <- ggplot(wine, aes(x = `volatile acidity`)) + hist_layer()
ca <- ggplot(wine, aes(x = `citric acid`)) + hist_layer()
rs <- ggplot(wine, aes(x = `residual sugar`)) + hist_layer()
ch <- ggplot(wine, aes(x = `chlorides`)) + hist_layer()
fSO2 <- ggplot(wine, aes(x = `free sulfur dioxide`)) + hist_layer()
tSO2 <- ggplot(wine, aes(x = `total sulfur dioxide`)) + hist_layer()
density <- ggplot(wine, aes(x = density)) + hist_layer()
pH <- ggplot(wine, aes(x = pH)) + hist_layer()
sulphates <- ggplot(wine, aes(x = sulphates)) + hist_layer()
alcohol <- ggplot(wine, aes(x = alcohol)) + hist_layer()

# Arrange the eleven histograms on a grid of 4 columns by 3 rows.
ggarrange(
  fa, va, ca, rs, ch, fSO2, tSO2, density, pH, sulphates, alcohol,
  ncol = 4,
  nrow = 3
)

# Note: this is not a trellis (facet) display; each histogram is plotted individually and the plots are then arranged together.

2.3 Visualising Correlation Matrix using corrplot Package

# Pairwise correlations of the variables held in columns 2 to 12 of `wine`.
wine.cor <- cor(wine[, 2:12])

# Corrgram with the default settings.
corrplot(wine.cor)

# Ellipses instead of the default glyph, lower triangle only.
corrplot(wine.cor, type = "lower", method = "ellipse")

# Same as above, without the diagonal and with black text labels.
corrplot(
  wine.cor,
  type = "lower",
  method = "ellipse",
  tl.col = "black",
  diag = FALSE
)

# Mixed corrgram: ellipses in the lower part, correlation values in the upper
# part.
corrplot.mixed(
  wine.cor,
  upper = "number",
  lower = "ellipse",
  tl.col = "black",
  tl.pos = "lt"
)

2.4 Combining corrgram with the significance test

# Significance test for every pairwise correlation.
# cor.mtest() runs a correlation test on each pair of columns of the matrix it
# is given, so it must receive the observations (the same columns of `wine`
# that produced `wine.cor`), not the correlation matrix itself.
wine.sig <- cor.mtest(wine[, 2:12], conf.level = .95)

# Lower-triangle corrgram of the coefficients, using the p-values from the
# test above.
corrplot(wine.cor,
         method = "number",
         type = "lower",
         diag = FALSE,
         tl.col = "black",
         p.mat = wine.sig$p,
         # set the significance level at 0.1
         sig.level = .1,
         tl.srt = 45)

#### 2.5 Combining corrgram with clustering

# Corrgram with the variables reordered by hierarchical clustering
# (method "ward.D") and rectangles drawn around 3 clusters.
corrplot(
  wine.cor,
  order = "hclust",
  hclust.method = "ward.D",
  addrect = 3,
  method = "ellipse",
  tl.col = "black",
  tl.pos = "lt"
)

In the Shiny app, build a selection control in the interface so that users can choose among these options.

3. Ternary Plot

# Packages used in the ternary-plot section.
packages <- c("ggtern", "plotly", "readr", "dplyr", "tidyr")

# Install any package that is not available yet, then attach it.
for (p in packages) {
  # requireNamespace() only checks availability; attaching is left to the
  # library() call below, which stops with an error if the install failed.
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}

The code chunk below will then be used to install ggplot2 version 3.2.1.

# devtools is loaded for install_version(), which is used by the (currently
# disabled) command below to pin ggplot2 to version 3.2.1.
require(devtools)
# Option 1 (disabled): install that version from a CRAN mirror.
# install_version("ggplot2", version = "3.2.1", repos = "http://cran.us.r-project.org")

# Option 2 (disabled): install from a direct download URL.
# url = "https://cran.r-project.org/bin/windows/contrib/r-devel-gcc8/ggplot2_3.2.1.zip"
# install.packages(url, repos=NULL, type="source")

Then, the code chunk below will be used to launch ggplot2 package in RStudio.

library(ggplot2)

3.1 Import data

# Read the population table, then print a compact overview of its columns.
pop_data <- read_csv(file = "data/respopagsex2000to2018_tidy.csv")
pop_data %>%
  glimpse()

## Observations: 108,126
## Variables: 5
## $ PA         <chr> "Ang Mo Kio", "Ang Mo Kio", "Ang Mo Kio", "Ang Mo Kio", ...
## $ SZ         <chr> "Ang Mo Kio Town Centre", "Ang Mo Kio Town Centre", "Ang...
## $ AG         <chr> "AGE0-4", "AGE0-4", "AGE0-4", "AGE0-4", "AGE0-4", "AGE0-...
## $ Year       <dbl> 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2011, 20...
## $ Population <dbl> 290, 270, 260, 250, 260, 250, 200, 180, 290, 290, 270, 3...

3.2 Prepare data

# Reshape the population table to one column per age group (AG), derive the
# YOUNG, ACTIVE, OLD and TOTAL sums, and keep the 2018 rows with a positive
# total.
# NOTE(review): the rowSums() calls select columns by position, so they depend
# on the column order produced by spread() -- confirm that 4:8, 9:16 and 17:21
# cover the intended age groups.
agpop_mutated <- pop_data %>%
  mutate(Year = as.character(Year)) %>%
  # one column per AG value, filled with Population
  spread(AG, Population) %>%
  # `.` is the data frame entering each step, so every mutate() sees the
  # columns added by the steps before it
  mutate(YOUNG = rowSums(.[4:8])) %>%
  mutate(ACTIVE = rowSums(.[9:16])) %>%
  mutate(OLD = rowSums(.[17:21])) %>%
  # presumably columns 22:24 are the YOUNG, ACTIVE and OLD columns just added
  # -- verify against the number of age-group columns
  mutate(TOTAL = rowSums(.[22:24])) %>%
  # Year is character at this point; the comparison relies on R coercing the
  # number 2018 to "2018"
  filter(Year == 2018) %>%
  filter(TOTAL > 0)

3.3 Ternary plot

# Static ternary plot: one point per row of `agpop_mutated`, positioned by its
# YOUNG, ACTIVE and OLD values.
# Error noted by the author for this chunk:
#   Error: $ operator is invalid for atomic vectors
ggtern(
  data = agpop_mutated,
  aes(x = YOUNG, y = ACTIVE, z = OLD)
) +
  geom_point()

4. Parallel coordinates

4.1 Install packages

# Packages used in the parallel-coordinates section.
packages <- c("GGally", "plotly", "parcoords", "tidyverse")

# Install any package that is not available yet, then attach it.
for (p in packages) {
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  # Attach after the check so that a freshly installed package is loaded too;
  # without this call a package that had to be installed was never attached.
  library(p, character.only = TRUE)
}

4.2 Load data

# Read the 2018 World Happiness data, then print a compact overview of its
# columns.
wh <- read_csv(file = "data/WHData-2018.csv")
wh %>%
  glimpse()

## Observations: 156
## Variables: 12
## $ Country                        <chr> "Albania", "Bosnia and Herzegovina",...
## $ Region                         <chr> "Central and Eastern Europe", "Centr...
## $ `Happiness score`              <dbl> 4.586, 5.129, 4.933, 5.321, 6.711, 5...
## $ `Whisker-high`                 <dbl> 4.695, 5.224, 5.022, 5.398, 6.783, 5...
## $ `Whisker-low`                  <dbl> 4.477, 5.035, 4.844, 5.244, 6.639, 5...
## $ Dystopia                       <dbl> 1.462, 1.883, 1.219, 1.769, 2.494, 1...
## $ `GDP per capita`               <dbl> 0.916, 0.915, 1.054, 1.115, 1.233, 1...
## $ `Social support`               <dbl> 0.817, 1.078, 1.515, 1.161, 1.489, 1...
## $ `Healthy life expectancy`      <dbl> 0.790, 0.758, 0.712, 0.737, 0.854, 0...
## $ `Freedom to make life choices` <dbl> 0.419, 0.280, 0.359, 0.380, 0.543, 0...
## $ Generosity                     <dbl> 0.149, 0.216, 0.064, 0.120, 0.064, 0...
## $ `Perceptions of corruption`    <dbl> 0.032, 0.000, 0.009, 0.039, 0.034, 0...

4.3 Plot

4.3.1 Working with ggparcoord() of GGally package

# Static parallel-coordinates plot of columns 7 to 12 of `wh`, grouped by
# column 2 (Region) and drawn as one panel per Region, with boxplots overlaid
# on each axis.
# NOTE(review): the title spells "Happines"; the string is left unchanged here.
ggparcoord(
  wh,
  columns = 7:12,
  groupColumn = 2,
  scale = "uniminmax",
  boxplot = TRUE,
  title = "Parallel Coord. Plot of World Happines Attributes"
) +
  facet_wrap(~Region)

4.3.2 Plotting Interactive Parallel Coordinates

# Interactive parallel-coordinates widget for columns 7 to 12 of `wh`.
parcoords(
  wh[, 7:12],
  # TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
  reorderable = TRUE,
  brushMode = "1D-axes"
)

5. Heatmap

5.1 Load packages

# Packages used in the heatmap section.
packages <- c("seriation", "dendextend", "heatmaply", "tidyverse")

# Install any package that is not available yet, then attach it.
for (p in packages) {
  # requireNamespace() only checks availability; attaching is left to the
  # library() call below, which stops with an error if the install failed.
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}

5.2 Data Preparation

# Keep the happiness score (column 3) and the six attribute columns (7 to 12),
# then convert them to a numeric matrix.
wh1 <- select(wh, c(3, 7:12))
wh_matrix <- data.matrix(wh1)

# Label the rows with the country names. The names are set on the matrix
# because setting row names on a tibble is deprecated (it raised the warning
# "Setting row names on a tibble is deprecated.").
rownames(wh_matrix) <- wh$Country

5.3 Plot Heatmap

5.3.1 Static

# Static heatmap of the happiness matrix, scaled by column; the cex arguments
# shrink the row and column labels.
wh_heatmap <- heatmap(
  wh_matrix,
  scale = "column",
  cexCol = 0.8,
  cexRow = 0.6
)

5.3.2 Interactive

# Interactive heatmap of the same matrix, drawn as an 800 x 1000 widget.
heatmaply(
  wh_matrix,
  height = 1000,
  width = 800
)