0.1 Data Visualization using GGplot2

Load the ggplot2 library. If it is not installed, install it first with install.packages("ggplot2").

library(ggplot2)

0.1.1 Barplot

# Ggplot2 library
library(ggplot2)
 
# Use the mtcars dataset; preview its first six rows.
head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
# Basic bar plot: one bar per cylinder count; bar height is the number of cars.
ggplot(mtcars, aes(x = as.factor(cyl))) +
  geom_bar() +
  labs(x = "Number of Cylinders", y = "Count")

# 1: Uniform color. `color` sets the bar outline, `fill` sets the interior.
ggplot(mtcars, aes(x = as.factor(cyl))) +
  geom_bar(color = "blue", fill = rgb(0.1, 0.4, 0.5, 0.7))

# 2: Fill by group with the hue scale
ggplot(mtcars, aes(x = as.factor(cyl), fill = as.factor(cyl))) +
  geom_bar() +
  scale_fill_hue(c = 40)

# 3: Fill by group with an RColorBrewer palette
ggplot(mtcars, aes(x = as.factor(cyl), fill = as.factor(cyl))) +
  geom_bar() +
  scale_fill_brewer(palette = "Set1")

# 4: Fill by group in greyscale
ggplot(mtcars, aes(x = as.factor(cyl), fill = as.factor(cyl))) +
  geom_bar() +
  scale_fill_grey(start = 0.25, end = 0.75)

# 5: Fill colors set manually, one per group
ggplot(mtcars, aes(x = as.factor(cyl), fill = as.factor(cyl))) +
  geom_bar() +
  scale_fill_manual(values = c("red", "green", "blue"))

# 6: Horizontal bar plot (axes flipped)
ggplot(mtcars, aes(x = as.factor(cyl), fill = as.factor(cyl))) +
  geom_bar() +
  coord_flip()

# 7: Custom bar width
ggplot(mtcars, aes(x = as.factor(cyl), fill = as.factor(cyl))) +
  geom_bar(width = 0.4)

0.1.2 Grouped and Stacked Barplots

# library
library(ggplot2)
 
# Build a toy dataset: 4 species x 3 conditions, with one random
# non-negative value per combination (values differ on every run).
specie <- rep(c("apple", "mango", "banana", "peach"), each = 3)
condition <- rep(c("normal", "stress", "Nitrogen"), times = 4)
value <- abs(rnorm(12, mean = 0, sd = 15))
data <- data.frame(specie, condition, value)

head(data)
##   specie condition     value
## 1  apple    normal 21.217761
## 2  apple    stress 18.149166
## 3  apple  Nitrogen 26.155190
## 4  mango    normal 15.827057
## 5  mango    stress  4.615567
## 6  mango  Nitrogen  4.948118
# Grouped: bars for each condition are placed side by side within a specie
ggplot(data, aes(x = specie, y = value, fill = condition)) +
  geom_bar(stat = "identity", position = "dodge")

# Stacked: conditions are stacked on top of each other (default position)
ggplot(data, aes(x = specie, y = value, fill = condition)) +
  geom_bar(stat = "identity")

# Stacked percent: each bar is rescaled to the same total height
ggplot(data, aes(x = specie, y = value, fill = condition)) +
  geom_bar(stat = "identity", position = "fill")

# Same percent plot, colored with an RColorBrewer palette
ggplot(data, aes(x = specie, y = value, fill = condition)) +
  geom_bar(stat = "identity", position = "fill") +
  scale_fill_brewer(palette = "Set1")

# Faceting: one panel per condition
ggplot(data, aes(x = specie, y = value, color = specie, fill = specie)) +
  geom_bar(stat = "identity") +
  facet_wrap(~condition)

0.1.3 Histograms

# library
library(ggplot2)
 
# Dataset: a single column of 10000 draws from rnorm() with default parameters
data <- data.frame(value = rnorm(n = 10000))

head(data)
##        value
## 1 -1.9866024
## 2  0.5871844
## 3 -0.2282852
## 4  0.3897893
## 5  0.7204838
## 6  0.3417003
# Basic histogram. bins = 30 is the value geom_histogram() falls back to when
# no binning is given; stating it explicitly avoids the "pick better value"
# message without changing the plot.
ggplot(data, aes(x = value)) +
  geom_histogram(bins = 30)

# Custom binning: give the width of each bin directly
ggplot(data, aes(x = value)) +
  geom_histogram(binwidth = 0.05)

# Uniform color: white outline, semi-transparent green fill
ggplot(data, aes(x = value)) +
  geom_histogram(
    binwidth = 0.2,
    color = "white",
    fill = rgb(0.2, 0.7, 0.1, 0.4)
  )

# Proportional color: fill is mapped to the per-bin count computed by the stat.
# after_stat(count) replaces the deprecated ..count.. notation.
ggplot(data, aes(x = value)) +
  geom_histogram(binwidth = 0.2, aes(fill = after_stat(count)))

0.1.4 Scatterplots

# library
library(ggplot2)
 
# Preview the first rows of iris, a dataset that ships with R
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Basic scatterplot of sepal width against sepal length
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point()

# Customize the points: shape 21 is a circle with a separate outline (color)
# and interior (fill); stroke sets the outline thickness.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point(
    shape = 21,
    color = "black",
    fill = "blue",
    size = 2,
    stroke = 2,
    alpha = 0.5
  )

# Color and shape depend on a factor (categorical variable: Species)
ggplot(
  iris,
  aes(x = Sepal.Length, y = Sepal.Width, color = Species, shape = Species)
) +
  geom_point(size = 6, alpha = 0.6)

# Color and size depend on a continuous variable (Petal.Length)
ggplot(
  iris,
  aes(
    x = Sepal.Length,
    y = Sepal.Width,
    color = Petal.Length,
    size = Petal.Length
  )
) +
  geom_point(alpha = 0.6)

# Noisy linear data: my_x and my_y both follow 1..100 plus Gaussian noise.
# cond has only 20 values (10 of each label); data.frame() recycles it to
# fill all 100 rows.
data <- data.frame(
  cond = rep(c("condition_1", "condition_2"), each = 10),
  my_x = 1:100 + rnorm(100, sd = 9),
  my_y = 1:100 + rnorm(100, sd = 16)
)

head(data)
##          cond       my_x       my_y
## 1 condition_1 -7.5574224  -2.438493
## 2 condition_1 -0.6726812  -3.488034
## 3 condition_1  6.6869147 -14.094291
## 4 condition_1  7.4804401  16.536489
## 5 condition_1 -4.1656649  -1.750653
## 6 condition_1 15.3304614 -14.875692
# Add a linear trend: regression line only (se = FALSE hides the band)
ggplot(data, aes(x = my_x, y = my_y)) +
  geom_point(shape = 1) +
  geom_smooth(method = lm, color = "red", se = FALSE)

# Add a linear trend: regression line with its confidence interval band
ggplot(data, aes(x = my_x, y = my_y)) +
  geom_point(shape = 1) +
  geom_smooth(method = lm, color = "red", se = TRUE)

0.1.5 Text

# library
library(ggplot2)
 
# Keep the first 30 rows of the built-in mtcars dataset
data <- head(mtcars, n = 30)

head(data)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
# Add text with geom_text. nudge_x / nudge_y offset each label from its point;
# check_overlap = TRUE skips labels that would overlap one already drawn.
ggplot(data, aes(x = wt, y = mpg)) +
  geom_point() +
  geom_text(
    label = rownames(data),
    nudge_x = 0.25,
    nudge_y = 0.25,
    # Spelled out as TRUE: `T` is an ordinary variable that can be reassigned.
    check_overlap = TRUE
  )

# To improve readability, use geom_label (text drawn on a filled box)
ggplot(data, aes(x = wt, y = mpg)) +
  geom_point() +
  geom_label(label = rownames(data), nudge_x = 0.25, nudge_y = 0.2)

# geom_label can be customized like any other geom: here the box fill is
# mapped to cyl and the text is drawn in white.
ggplot(data, aes(x = wt, y = mpg, fill = cyl)) +
  geom_label(label = rownames(data), color = "white", size = 5)

0.1.6 Boxplots

# library
library(ggplot2)
 
# Preview the first rows of mtcars, a dataset that ships with R
head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
# A minimal boxplot: distribution of mpg for each cylinder count
ggplot(mtcars, aes(x = as.factor(cyl), y = mpg)) +
  geom_boxplot(fill = "slateblue", alpha = 0.2) +
  labs(x = "cyl")

# Preview the first rows of mpg, the fuel-economy dataset bundled with ggplot2
# (this is a different dataset from mtcars)
head(mpg)
## # A tibble: 6 x 11
##   manufacturer model displ  year   cyl      trans   drv   cty   hwy    fl
##          <chr> <chr> <dbl> <int> <int>      <chr> <chr> <int> <int> <chr>
## 1         audi    a4   1.8  1999     4   auto(l5)     f    18    29     p
## 2         audi    a4   1.8  1999     4 manual(m5)     f    21    29     p
## 3         audi    a4   2.0  2008     4 manual(m6)     f    20    31     p
## 4         audi    a4   2.0  2008     4   auto(av)     f    21    30     p
## 5         audi    a4   2.8  1999     6   auto(l5)     f    16    26     p
## 6         audi    a4   2.8  1999     6 manual(m5)     f    18    26     p
## # ... with 1 more variables: class <chr>
# One fill color per vehicle class; the legend is hidden because the x axis
# already names each class.
ggplot(mpg, aes(x = class, y = hwy, fill = class)) +
  geom_boxplot(alpha = 0.3) +
  theme(legend.position = "none")

# Create a data frame of 280 observations: 7 varieties (A-G) with 40 rows
# each. treatment has only 40 values (20 "high" then 20 "low");
# data.frame() recycles it so every variety gets both treatments.
variety <- rep(LETTERS[1:7], each = 40)
treatment <- rep(c("high", "low"), each = 20)
# note = row index plus a random integer in 1..150 (differs on every run).
# seq_len() replaces the redundant seq(1:280); TRUE replaces the reassignable T.
note <- seq_len(280) + sample(1:150, 280, replace = TRUE)
data <- data.frame(variety, treatment, note)

head(data)
##   variety treatment note
## 1       A      high   98
## 2       A      high   90
## 3       A      high  100
## 4       A      high  148
## 5       A      high   98
## 6       A      high   70
# Grouped boxplot: two boxes (one per treatment) for each variety
ggplot(data, aes(x = variety, y = note, fill = treatment)) +
  geom_boxplot()

# One panel per treatment
ggplot(data, aes(x = variety, y = note, fill = treatment)) +
  geom_boxplot() +
  facet_wrap(~treatment)

# One panel per variety, each with its own axis ranges.
# The argument is spelled `scales` in full rather than relying on partial
# matching of `scale`.
ggplot(data, aes(x = variety, y = note, fill = treatment)) +
  geom_boxplot() +
  facet_wrap(~variety, scales = "free")

library(ggplot2)
 
# Create data: four groups of unequal size (A = 20, B = 8, C = 30, D = 80),
# each with integer values sampled from its own range (differs on every run).
names <- rep(c("A", "B", "C", "D"), times = c(20, 8, 30, 80))
value <- c(
  sample(2:5, 20, replace = TRUE),
  sample(4:10, 8, replace = TRUE),
  sample(1:7, 30, replace = TRUE),
  sample(3:8, 80, replace = TRUE)
)
data <- data.frame(names, value)

head(data)
##   names value
## 1     A     2
## 2     A     4
## 3     A     2
## 4     A     2
## 5     A     5
## 6     A     3
# Boxplot per group with the group mean overlaid as a large red point.
# stat_summary() takes the summary function through `fun`; the older `fun.y`
# argument is deprecated.
ggplot(data, aes(x = names, y = value, fill = names)) +
  geom_boxplot(alpha = 0.4) +
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 20,
    size = 10,
    color = "red",
    fill = "red"
  ) +
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "Set3")

0.1.7 Density plots

# ggplot2 library
library(ggplot2)
 
# Let's use the diamonds dataset: load it into the workspace, then preview
# its first rows
data(diamonds)
head(diamonds)
## # A tibble: 6 x 10
##   carat       cut color clarity depth table price     x     y     z
##   <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23     Ideal     E     SI2  61.5    55   326  3.95  3.98  2.43
## 2  0.21   Premium     E     SI1  59.8    61   326  3.89  3.84  2.31
## 3  0.23      Good     E     VS1  56.9    65   327  4.05  4.07  2.31
## 4  0.29   Premium     I     VS2  62.4    58   334  4.20  4.23  2.63
## 5  0.31      Good     J     SI2  63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336  3.94  3.96  2.48
# Plot 1: density of price, one curve per diamond cut
ggplot(diamonds, mapping = aes(x = price, group = cut, fill = cut)) +
  geom_density(adjust = 1.5)

# Plot 2: same densities, made semi-transparent with alpha so that
# overlapping curves stay visible
ggplot(diamonds, mapping = aes(x = price, group = cut, fill = cut)) +
  geom_density(adjust = 1.5, alpha = 0.2)

0.1.8 Correlation plots

#install.packages("GGally")
library("GGally")

# Prepare some data: keep six numeric columns of mtcars
# (columns 1, 3, 4, 5, 6 and 7, selected here by name)
df <- mtcars[, c("mpg", "disp", "hp", "drat", "wt", "qsec")]

head(df)
##                    mpg disp  hp drat    wt  qsec
## Mazda RX4         21.0  160 110 3.90 2.620 16.46
## Mazda RX4 Wag     21.0  160 110 3.90 2.875 17.02
## Datsun 710        22.8  108  93 3.85 2.320 18.61
## Hornet 4 Drive    21.4  258 110 3.08 3.215 19.44
## Hornet Sportabout 18.7  360 175 3.15 3.440 17.02
## Valiant           18.1  225 105 2.76 3.460 20.22
# Correlation plot 1: correlation matrix drawn with ggcorr, using the
# "RdBu" palette and with labels enabled
ggcorr(data = df, label = TRUE, palette = "RdBu")

# Correlation plot 2: pairwise plot matrix drawn with ggpairs
ggpairs(data = df)

These are some of the plots you will need for exploratory data analysis. If you wish to learn more, I am including a few resources below. Please check them out.

Advanced Plots Documentation GGplot2 cheatsheet GGPlot2 book