# NOTE(review): hard-coded, machine-specific working directory. Relative paths
# used later in this file (e.g. load("fish.RData")) resolve against it, so the
# script only runs as-is on a machine that has this exact folder.
setwd("C:/Users/maiam/Dropbox/PROFESSIONAL DEVELOPMENT/DATA SCIENCE/01_R/Data Visualization with ggplot2 (Part 1)")

Exploring ggplot2

Basic ggplot2 commands: build a plot of the mtcars dataset, which contains information about 32 cars from a 1973 Motor Trend magazine.

# Load the ggplot2 package
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.4.4

# Explore the mtcars data frame with str()
str(mtcars)

## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

# Treat the transmission indicator (am) as a categorical variable
mtcars$am <- factor(x = mtcars$am)

Although cyl (the number of cylinders) is categorical, it is classified as numeric in mtcars.

# Recode cyl as a factor in place, then plot mpg for each cylinder count
mtcars$cyl <- factor(x = mtcars$cyl)
ggplot(data = mtcars, mapping = aes(x = factor(cyl), y = mpg)) +
  geom_point()

#Grammar of Graphics **aesthetics: using color and size

# Scatter plot of fuel economy (mpg) against weight (wt)
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point()

# Same scatter plot, with displacement mapped to the colour of the points
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, colour = disp)) +
  geom_point()

# Same scatter plot, with displacement mapped to the size of the points
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, size = disp)) +
  geom_point()

Exploring ggplot2: using geom_point() and geom_smooth(). The diamonds data frame contains information on the prices and various metrics of 50,000 diamonds. Among the variables included are carat (a measurement of the size of the diamond) and price. For the next exercises, you’ll be using a subset of 1,000 diamonds.

# Explore the diamonds data frame with str()
str(diamonds)

## Classes 'tbl_df', 'tbl' and 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

# Scatter plot of price against carat
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point()

# Same scatter plot with a smoothed trend line layered on top
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Exploring ggplot2, part 5: more possibilities of combining geoms, using alpha

# 2 - Smoothed trend only, without the points
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_smooth()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# 3 - Map clarity to colour inside aes()
ggplot(data = diamonds, mapping = aes(x = carat, y = price, colour = clarity)) +
  geom_smooth()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# 4 - Same colour mapping, drawn as points with alpha set to 0.4
ggplot(data = diamonds, mapping = aes(x = carat, y = price, colour = clarity)) +
  geom_point(alpha = 0.4)

Understanding the grammar: adding layers to build beautiful & informative plots

# Create the object containing only the data and aes layers: dia_plot
dia_plot <- ggplot(diamonds, aes(x = carat, y = price))

# Add a geom layer with + and geom_point().
# The sum is not assigned, so it is auto-printed and the plot is rendered.
dia_plot + geom_point()

# Add the same geom layer, but with the clarity mapping in its own aes()
dia_plot + geom_point(aes(colour = clarity))

Understanding the grammar: explore mixing arguments and aesthetics in a single geometry

# 1 - Start from the bare data + aesthetics layer
dia_plot <- ggplot(data = diamonds, mapping = aes(x = carat, y = price))

# 2 - Overlay points with alpha set to 0.2 and keep the result in dia_plot
dia_plot <- dia_plot + geom_point(alpha = 0.2)

# 3 - Add a smoother with se set to FALSE
dia_plot + geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# 4 - Same smoother, with clarity mapped to colour inside the geom's own aes()
dia_plot + geom_smooth(mapping = aes(colour = clarity), se = FALSE)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Objects and Layers

base package and ggplot2
Make a plot of mpg (miles per gallon) against wt (weight in thousands of pounds) in the mtcars data frame, but this time you want the dots colored according to the number of cylinders, cyl, using the base package and ggplot2.

# Base-graphics scatter plot of mpg against wt, coloured by cyl
plot(x = mtcars$wt, y = mtcars$mpg, col = mtcars$cyl)

# Store a factor version of cyl in a new column, fcyl
mtcars$fcyl <- as.factor(x = mtcars$cyl)

# Repeat the scatter plot, this time colouring by the factor column
plot(x = mtcars$wt, y = mtcars$mpg, col = mtcars$fcyl)

base package and ggplot2: creating linear models with lm for each subset of data

# Fit a single linear model of mpg on wt and keep it as carModel
carModel <- lm(mpg ~ wt, data = mtcars)

# Basic plot: make sure cyl is a factor, then colour the points by it
mtcars$cyl <- as.factor(x = mtcars$cyl)
plot(x = mtcars$wt, y = mtcars$mpg, col = mtcars$cyl)

# Overlay the fitted regression line as a dashed line (lty = 2)
abline(reg = carModel, lty = 2)

# Plot each cyl subset with its own regression line.
# The loop runs over the distinct cylinder levels rather than over every row of
# mtcars, so each model is fitted and drawn exactly once. A for loop is used
# instead of lapply() because the calls are made only for their side effect;
# at top level lapply() would also print its list of NULL return values.
plot(mtcars$wt, mtcars$mpg, col = mtcars$cyl)
cyl_levels <- levels(factor(mtcars$cyl))
for (i in seq_along(cyl_levels)) {
  # Colour i is the integer code of the i-th level, which is the colour the
  # points of that level get when cyl is a factor (as set earlier in the file).
  abline(lm(mpg ~ wt, mtcars, subset = (cyl == cyl_levels[i])), col = i)
}

# Add a legend to the current base plot: one open-circle entry per cyl level,
# using colours 1 to 3, placed at (5, 33) and drawn without a box
legend(
  x = 5, y = 33,
  legend = levels(mtcars$cyl),
  col = 1:3,
  pch = 1,
  bty = "n"
)

# Plot 1: scatter plot of mpg against wt, coloured by cyl
ggplot(mtcars, aes(x = wt, y = mpg, col = cyl)) +
  geom_point()

# Plot 2: include the lines of the linear models, per cyl
ggplot(mtcars, aes(x = wt, y = mpg, col = cyl)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# Plot 3: include a lm for the entire dataset as a whole.
# aes(group = 1) puts every observation in one group for this layer only, so it
# draws a single dashed line for all cars instead of repeating the per-cyl fits.
ggplot(mtcars, aes(x = wt, y = mpg, col = cyl)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_smooth(aes(group = 1), method = "lm", se = FALSE, linetype = 2)

Tidy Data: using gather() and separate()

The resulting iris.tidy data should look as follows:

  Species  Part Measure Value
1  setosa Sepal  Length   5.1
2  setosa Sepal  Length   4.9
3  setosa Sepal  Length   4.7
4  setosa Sepal  Length   4.6
5  setosa Sepal  Length   5.0
6  setosa Sepal  Length   5.4
...

# Load the tidyr package
library(tidyr)

## Warning: package 'tidyr' was built under R version 3.4.4

# Build iris.tidy: stack every measurement column (all but Species) into
# key/Value pairs, then split the key at the dot into Part and Measure
iris.tidy <- iris %>%
  gather(key = "key", value = "Value", -Species) %>%
  separate(col = key, into = c("Part", "Measure"), sep = "\\.")

# Consider the structure of iris, iris.wide and iris.tidy 
str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

str(iris.tidy)

## 'data.frame':    600 obs. of  4 variables:
##  $ Species: Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Part   : chr  "Sepal" "Sepal" "Sepal" "Sepal" ...
##  $ Measure: chr  "Length" "Length" "Length" "Length" ...
##  $ Value  : num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...

# Jittered values per species, coloured by Part, one panel per Measure
ggplot(data = iris.tidy, mapping = aes(x = Species, y = Value, colour = Part)) +
  geom_jitter() +
  facet_grid(. ~ Measure)

Produce iris.wide.

The head of the iris.wide should look like this in the end:

  Species  Part Length Width
1  setosa Petal    1.4   0.2
2  setosa Petal    1.4   0.2
3  setosa Petal    1.3   0.2
4  setosa Petal    1.5   0.2
5  setosa Petal    1.4   0.2
6  setosa Petal    1.7   0.4
...

# Add a column with a unique id per flower (row).
# seq_len() is used instead of 1:nrow() because it yields an empty sequence
# for a zero-row data frame, where 1:0 would produce c(1, 0).
iris$Flower <- seq_len(nrow(iris))

# Build iris.wide: stack the measurement columns (keeping Species and Flower as
# identifiers), split the key at the dot into Part and Measure, then spread
# Measure back out into one column per measure
iris.wide <- iris %>%
  gather(key = "key", value = "value", -Species, -Flower) %>%
  separate(col = key, into = c("Part", "Measure"), sep = "\\.") %>%
  spread(key = Measure, value = value)
head(x = iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species Flower
## 1          5.1         3.5          1.4         0.2  setosa      1
## 2          4.9         3.0          1.4         0.2  setosa      2
## 3          4.7         3.2          1.3         0.2  setosa      3
## 4          4.6         3.1          1.5         0.2  setosa      4
## 5          5.0         3.6          1.4         0.2  setosa      5
## 6          5.4         3.9          1.7         0.4  setosa      6

head(iris.wide)

##   Species Flower  Part Length Width
## 1  setosa      1 Petal    1.4   0.2
## 2  setosa      1 Sepal    5.1   3.5
## 3  setosa      2 Petal    1.4   0.2
## 4  setosa      2 Sepal    4.9   3.0
## 5  setosa      3 Petal    1.3   0.2
## 6  setosa      3 Sepal    4.7   3.2

head(iris.tidy)

##   Species  Part Measure Value
## 1  setosa Sepal  Length   5.1
## 2  setosa Sepal  Length   4.9
## 3  setosa Sepal  Length   4.7
## 4  setosa Sepal  Length   4.6
## 5  setosa Sepal  Length   5.0
## 6  setosa Sepal  Length   5.4

# Jittered Width against Length, coloured by Part, one panel per Species
ggplot(data = iris.wide, mapping = aes(x = Length, y = Width, colour = Part)) +
  geom_jitter() +
  facet_grid(. ~ Species)

#Aesthetics Within the aes() we can consider the aesthetics: x, y, color, fill, size, alpha, labels and shape.

# 1 - mpg on the x axis, cyl on the y axis
ggplot(data = mtcars, mapping = aes(x = mpg, y = cyl)) +
  geom_point()

# 2 - Reversed: cyl on the x axis, mpg on the y axis
ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) +
  geom_point()

# 3 - wt on x, mpg on y, cyl mapped to colour
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, colour = cyl)) +
  geom_point()

# 4 - Same mapping, with shape set to 1 and size set to 4 for the points
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, colour = cyl)) +
  geom_point(shape = 1, size = 4)

Aesthetics: x: X axis position y: Y axis position Color: changes the outline of an object
fill: typically the inside shading
exception: on geom_point(), use color instead of fill for the inside of the point. shape: by default shape = 19 (solid circle) size: Diameter of points, thickness of lines alpha: Transparency linetype: Line dash pattern labels: Text on a plot or axes shape: Shape

# am and cyl are factors, wt is numeric
class(mtcars$am)

## [1] "factor"

class(mtcars$cyl)

## [1] "factor"

class(mtcars$wt)

## [1] "numeric"

# Starting point: cyl mapped to colour, points drawn with shape 1 and size 4
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, colour = cyl)) +
  geom_point(shape = 1, size = 4)

# 1 - Map cyl to fill instead of colour
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, fill = cyl)) +
  geom_point(shape = 1, size = 4)

# 2 - Same mapping, with shape set to 21 and alpha set to 0.6
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, fill = cyl)) +
  geom_point(shape = 21, size = 4, alpha = 0.6)

# 3 - Additionally map am to colour
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, fill = cyl, colour = am)) +
  geom_point(shape = 21, size = 4, alpha = 0.6)

# cyl mapped to size
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, size = cyl)) +
  geom_point()

## Warning: Using size for a discrete variable is not advised.

# cyl mapped to alpha
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, alpha = cyl)) +
  geom_point()

## Warning: Using alpha for a discrete variable is not advised.

# cyl mapped to shape
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, shape = cyl)) +
  geom_point()

# cyl mapped to label, drawn as text instead of points
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, label = cyl)) +
  geom_text()

Hexadecimal, literally “related to 16”, is a base-16 alphanumeric counting system. Individual values come from the ranges 0-9 and A-F. This means there are 256 possible two-digit values (i.e. 00 - FF). Hexadecimal colours use this system to specify a six-digit code for Red, Green and Blue values (“#RRGGBB”) of a colour (i.e. Pure blue: “#0000FF”, black: “#000000”, white: “#FFFFFF”). R can accept hex codes as valid colours.

# Hexadecimal colour code reused by the plots below
my_color <- "#4ABEFF"

# Colour as an *aesthetic*: cyl is mapped to colour inside aes()
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, colour = cyl)) +
  geom_point()

# Colour as an *attribute*: my_color is set directly in the geom layer
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point(colour = my_color)

# fill as an aesthetic (cyl); colour, size and shape set as attributes
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, fill = cyl)) +
  geom_point(colour = my_color, size = 10, shape = 23)

using geom_text()

# Points with alpha set to 0.5
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, fill = cyl)) +
  geom_point(alpha = 0.5)

# Points with shape set to 24 and colour set to yellow
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, fill = cyl)) +
  geom_point(shape = 24, colour = "yellow")

# Text layer: the row names of mtcars as labels, with colour set to red
ggplot(
  data = mtcars,
  mapping = aes(x = wt, y = mpg, fill = cyl, label = rownames(mtcars))
) +
  geom_text(colour = "red")

list of all the features of the observations in mtcars:

mpg – Miles/(US) gallon cyl – Number of cylinders disp – Displacement (cu.in.) hp – Gross horsepower drat – Rear axle ratio wt – Weight (lb/1000) qsec – 1/4 mile time vs – V/S engine. am – Transmission (0 = automatic, 1 = manual) gear – Number of forward gears carb – Number of carburetors

# mpg on x, qsec on y, factor(cyl) mapped to colour
ggplot(data = mtcars, mapping = aes(x = mpg, y = qsec, colour = factor(cyl))) +
  geom_point()

# Additionally map factor(am) to shape
ggplot(
  data = mtcars,
  mapping = aes(x = mpg, y = qsec, colour = factor(cyl), shape = factor(am))
) +
  geom_point()

# Additionally map (hp/wt) to size
ggplot(
  data = mtcars,
  mapping = aes(
    x = mpg, y = qsec,
    colour = factor(cyl), shape = factor(am), size = (hp/wt)
  )
) +
  geom_point()

#Overplotting: Point shape and transparency deal with overplotting when you have:

Large datasets, Imprecise data and so points are not clearly separated on your plot (you saw this in the video with the iris dataset), Interval data (i.e. data appears at fixed values), or Aligned data values on a single axis.

# Basic scatter plot: wt on x, mpg on y, cyl mapped to colour, size set to 4
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, colour = cyl)) +
  geom_point(size = 4)

# Same plot with shape set to 1 (hollow circles)
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, colour = cyl)) +
  geom_point(size = 4, shape = 1)

# Same plot with alpha set to 0.6 as well
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, colour = cyl)) +
  geom_point(size = 4, shape = 1, alpha = 0.6)

Overplotting: alpha with large datasets

# Scatter plot: carat on x, price on y, clarity mapped to colour
ggplot(data = diamonds, mapping = aes(x = carat, y = price, colour = clarity)) +
  geom_point()

# Same plot, adjusted for overplotting with alpha set to 0.5
ggplot(data = diamonds, mapping = aes(x = carat, y = price, colour = clarity)) +
  geom_point(alpha = 0.5)

# Scatter plot: clarity on x, carat on y, price mapped to colour
ggplot(data = diamonds, mapping = aes(x = clarity, y = carat, colour = price)) +
  geom_point(alpha = 0.5)

# Same plot with the points jittered
ggplot(data = diamonds, mapping = aes(x = clarity, y = carat, colour = price)) +
  geom_point(alpha = 0.5, position = "jitter")

Geometries

Scatter plots and jittering: using position = position_jitter()
You already saw a few examples using geom_point() where the result was not a scatter plot. For example, in the plot shown in the viewer a continuous variable, wt, is mapped to the y aesthetic, and a categorical variable, cyl, is mapped to the x aesthetic. This also leads to over-plotting, since the points are arranged on a single x position. You previously dealt with overplotting by setting the position = jitter inside geom_point(). Let’s look at some other solutions here.

# Reference plot: wt against cyl, drawn with plain points
ggplot(data = mtcars, mapping = aes(x = cyl, y = wt)) +
  geom_point()

# Alternatives:
# 1 - Use geom_jitter() with its defaults
ggplot(data = mtcars, mapping = aes(x = cyl, y = wt)) +
  geom_jitter()

# 2 - Use geom_jitter() with width set to 0.1
ggplot(data = mtcars, mapping = aes(x = cyl, y = wt)) +
  geom_jitter(width = 0.1)

# 3 - Use geom_point() with position_jitter() as its position argument
ggplot(data = mtcars, mapping = aes(x = cyl, y = wt)) +
  geom_point(position = position_jitter(width = 0.1))

Scatter plots and jittering: Use geom_jitter() Vocab dataset. The Vocab dataset contains information about the years of education and integer score on a vocabulary test for over 21,000 individuals based on US General Social Surveys from 1972-2004.

library(car)

## Warning: package 'car' was built under R version 3.4.4

## Loading required package: carData

## Warning: package 'carData' was built under R version 3.4.4

# Load Vocab and recode the vocabulary and education columns as factors
data(Vocab)
Vocab[c("vocabulary", "education")] <-
  lapply(Vocab[c("vocabulary", "education")], factor)

# Examine the structure of Vocab

str(Vocab)

## 'data.frame':    30351 obs. of  4 variables:
##  $ year      : num  1974 1974 1974 1974 1974 ...
##  $ sex       : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 2 2 2 1 1 ...
##  $ education : Factor w/ 21 levels "0","1","2","3",..: 15 17 11 11 13 17 18 11 13 12 ...
##  $ vocabulary: Factor w/ 11 levels "0","1","2","3",..: 10 10 10 6 9 9 10 6 4 6 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:32115] 1 2 3 4 5 6 7 8 9 10 ...
##   .. ..- attr(*, "names")= chr [1:32115] "19720001" "19720002" "19720003" "19720004" ...

# Plain points: vocabulary (y) against education (x)
ggplot(data = Vocab, mapping = aes(x = education, y = vocabulary)) +
  geom_point()

# Jittered points instead of plain points
ggplot(data = Vocab, mapping = aes(x = education, y = vocabulary)) +
  geom_jitter()

# Jittered points with alpha set to 0.2
ggplot(data = Vocab, mapping = aes(x = education, y = vocabulary)) +
  geom_jitter(alpha = 0.2)

# Jittered points with alpha set to 0.2 and shape set to 1
ggplot(data = Vocab, mapping = aes(x = education, y = vocabulary)) +
  geom_jitter(alpha = 0.2, shape = 1)

Histograms

Histograms are one of the most common and intuitive ways of showing distributions.

The x axis/aesthetic: The documentation for geom_histogram() states the argument stat = “bin” as a default.

The y axis/aesthetic: geom_histogram() only requires one aesthetic: x.

# 1 - Make a univariate histogram
ggplot(mtcars, aes(x = mpg)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# 2 - Plot 1, plus set binwidth to 1 in the geom layer
ggplot(mtcars, aes(x = mpg)) +
  geom_histogram(binwidth = 1)

# 3 - Plot 2, plus MAP ..density.. to the y aesthetic (i.e. in a second aes() function)
ggplot(mtcars, aes(x = mpg)) +
  geom_histogram(binwidth = 1, aes(y = ..density..))

# 4 - Plot 3, plus SET the fill attribute to "#377EB8".
# fill is passed outside aes() so the hex code is used as the literal bar
# colour. Inside aes() the string would be mapped as a one-level variable,
# drawn in the default palette colour and given a legend.
ggplot(mtcars, aes(x = mpg)) +
  geom_histogram(binwidth = 1, aes(y = ..density..), fill = "#377EB8")

#Bar chart Position Argument stack: place the bars on top of each other. Counts are used. This is the default position. fill: place the bars on top of each other, but this time use proportions. dodge: place the bars next to each other. Counts are used.

# Bar plot of cyl, filled according to am (default position)
ggplot(data = mtcars, mapping = aes(x = cyl, fill = am)) +
  geom_bar()

# Same bar plot with position given explicitly as "stack"
ggplot(data = mtcars, mapping = aes(x = cyl, fill = am)) +
  geom_bar(position = "stack")

# Same bar plot with position "fill"
ggplot(data = mtcars, mapping = aes(x = cyl, fill = am)) +
  geom_bar(position = "fill")

# Same bar plot with position "dodge"
ggplot(data = mtcars, mapping = aes(x = cyl, fill = am)) +
  geom_bar(position = "dodge")

Overlapping histograms pose similar problems to overlapping bar plots, but there is a unique solution here: a frequency polygon. This is a geom specific to binned data that draws a line connecting the value of each bin. Like geom_histogram(), it takes a binwidth argument and by default stat = “bin” and position = “identity”

# Histogram of mpg with binwidth 1, filled according to cyl
ggplot(data = mtcars, mapping = aes(x = mpg, fill = cyl)) +
  geom_histogram(binwidth = 1)

# Same histogram with position set to "identity"
ggplot(data = mtcars, mapping = aes(x = mpg, fill = cyl)) +
  geom_histogram(binwidth = 1, position = "identity")

# Frequency polygon instead of a histogram, with cyl mapped to colour
ggplot(data = mtcars, mapping = aes(x = mpg, colour = cyl)) +
  geom_freqpoly(binwidth = 1)

Overlapping bar plots by adjusting the dodging: using position_dodge(). By adjusting the dodging, your bars partially overlap each other. Instead of using position = “dodge” you’re going to use position_dodge().

# 1 - The last plot from the previous exercise: dodged bars of cyl filled by am
ggplot(data = mtcars, mapping = aes(x = cyl, fill = am)) +
  geom_bar(position = "dodge")

# 2 - Position object posn_d: dodging with width 0.2
posn_d <- position_dodge(width = 0.2)

# 3 - Same bar plot, using posn_d as the position argument
ggplot(data = mtcars, mapping = aes(x = cyl, fill = am)) +
  geom_bar(position = posn_d)

# 4 - Same as 3, with alpha set to 0.6
ggplot(data = mtcars, mapping = aes(x = cyl, fill = am)) +
  geom_bar(position = posn_d, alpha = 0.6)

Bar plots with color ramp In this example of a bar plot, you’ll fill each segment according to an ordinal variable. The best way to do that is with a sequential color series.

# Bar plot of cyl filled by am, using the brewed "Set1" palette for fill
ggplot(data = mtcars, mapping = aes(x = cyl, fill = am)) +
  geom_bar() +
  scale_fill_brewer(palette = "Set1")

# Use str() on Vocab to check out the structure
str(Vocab)

## 'data.frame':    30351 obs. of  4 variables:
##  $ year      : num  1974 1974 1974 1974 1974 ...
##  $ sex       : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 2 2 2 1 1 ...
##  $ education : Factor w/ 21 levels "0","1","2","3",..: 15 17 11 11 13 17 18 11 13 12 ...
##  $ vocabulary: Factor w/ 11 levels "0","1","2","3",..: 10 10 10 6 9 9 10 6 4 6 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:32115] 1 2 3 4 5 6 7 8 9 10 ...
##   .. ..- attr(*, "names")= chr [1:32115] "19720001" "19720002" "19720003" "19720004" ...

# Bars of education with position "fill", filled by vocabulary,
# using scale_fill_brewer() with its default palette
ggplot(data = Vocab, mapping = aes(x = education, fill = vocabulary)) +
  geom_bar(position = "fill") +
  scale_fill_brewer()

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors

Bar plots with color ramp. For continuous data, the default RColorBrewer palette that scale_fill_brewer() calls is “Blues”. There are only 9 colours in the palette, and since you have 11 categories, your plot looked strange. We can manually create a color palette that can generate all the colours you need. To do this you’ll use a function called colorRampPalette().

library(RColorBrewer)

## Warning: package 'RColorBrewer' was built under R version 3.4.4

# Final plot of the last exercise, repeated for reference
ggplot(data = Vocab, mapping = aes(x = education, fill = vocabulary)) +
  geom_bar(position = "fill") +
  scale_fill_brewer()

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors

# The 9 colours of the "Blues" palette (from the RColorBrewer package)
blues <- brewer.pal(n = 9, name = "Blues")

# 1 - Turn the set of blues into a colour-range function
blue_range <- colorRampPalette(colors = blues)

# 2 - Fill the bars manually with 11 colours generated by blue_range
ggplot(data = Vocab, mapping = aes(x = education, fill = vocabulary)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = blue_range(11))

#Multiple time series

The dataset you’ll use contains the global capture rates of seven salmon species from 1950 - 2010.

fish.species: Each variable (column) is a Salmon Species and each observation (row) is one Year. Tidy data gather() function of the tidyr package

library(ggplot2)
data(economics)
# Begin and end dates of six periods, stored as character strings.
# stringsAsFactors is spelled FALSE rather than F: F is an ordinary variable
# that can be reassigned, whereas FALSE is a reserved word.
recess <- data.frame(
  begin = c("1969-12-01", "1973-11-01", "1980-01-01", "1981-07-01", "1990-07-01", "2001-03-01"),
  end = c("1970-11-01", "1975-03-01", "1980-07-01", "1982-11-01", "1991-03-01", "2001-11-01"),
  stringsAsFactors = FALSE
)
library(lubridate)

## Warning: package 'lubridate' was built under R version 3.4.4

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

# Parse the year-month-day strings in both columns with lubridate's ymd()
recess[["begin"]] <- ymd(recess[["begin"]])
recess[["end"]] <- ymd(recess[["end"]])

# Line plot of unemploy/pop over time
ggplot(data = economics, mapping = aes(x = date, y = unemploy/pop)) +
  geom_line()

# Same line plot, with one full-height translucent red rectangle per row of
# recess drawn underneath the line; the rectangle layer uses its own data and
# does not inherit the plot-level aesthetics
ggplot(data = economics, mapping = aes(x = date, y = unemploy/pop)) +
  geom_rect(
    mapping = aes(xmin = begin, xmax = end, ymin = -Inf, ymax = Inf),
    data = recess,
    fill = "red",
    alpha = 0.2,
    inherit.aes = FALSE
  ) +
  geom_line()

load("fish.RData") 
head(fish.species)

##   Year   Pink   Chum Sockeye  Coho Rainbow Chinook Atlantic
## 1 1950 100600 139300   64100 30500       0   23200    10800
## 2 1951 259000 155900   51200 40900     100   25500     9701
## 3 1952 132600 113800   58200 33600     100   24900     9800
## 4 1953 235900  99800   66100 32400     100   25300     8800
## 5 1954 123400 148700   83800 38300     100   24500     9600
## 6 1955 244400 143700   72000 45100     100   27700     7800

# Check the structure as a starting point
str(fish.species)

## 'data.frame':    61 obs. of  8 variables:
##  $ Year    : int  1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 ...
##  $ Pink    : int  100600 259000 132600 235900 123400 244400 203400 270119 200798 200085 ...
##  $ Chum    : int  139300 155900 113800 99800 148700 143700 158480 125377 132407 113114 ...
##  $ Sockeye : int  64100 51200 58200 66100 83800 72000 84800 69676 100520 62472 ...
##  $ Coho    : int  30500 40900 33600 32400 38300 45100 40000 39900 39200 32865 ...
##  $ Rainbow : int  0 100 100 100 100 100 100 100 100 100 ...
##  $ Chinook : int  23200 25500 24900 25300 24500 27700 25300 21200 20900 20335 ...
##  $ Atlantic: int  10800 9701 9800 8800 9600 7800 8100 9000 8801 8700 ...

# Reshape fish.species to long form: every column except Year becomes a
# Species/Capture pair
fish.tidy <- gather(data = fish.species, key = "Species", value = "Capture", -Year)

# One line per species: Capture against Year
ggplot(data = fish.tidy, mapping = aes(x = Year, y = Capture, colour = Species)) +
  geom_line()

using qplot

# Base graphics, formula notation
plot(mpg ~ wt, data = mtcars)

# Base graphics, x, y notation
with(data = mtcars, expr = plot(x = wt, y = mpg))

# The same scatter plot with ggplot()
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point()

# The same scatter plot with qplot()
qplot(x = wt, y = mpg, data = mtcars)

Using aesthetics

# Plain qplot scatter plot of mpg against wt
qplot(x = wt, y = mpg, data = mtcars)

# Size aesthetic:
# factor(cyl) mapped onto size
qplot(x = wt, y = mpg, data = mtcars, size = factor(cyl))

## Warning: Using size for a discrete variable is not advised.

# gear mapped onto size
qplot(x = wt, y = mpg, data = mtcars, size = gear)

# Color aesthetic:
# hp mapped onto color
qplot(x = wt, y = mpg, data = mtcars, color = hp)

# qsec mapped onto color
qplot(x = wt, y = mpg, data = mtcars, color = qsec)

choosing geoms

# qplot() given only x: factor(cyl)
qplot(x = factor(cyl), data = mtcars)

# qplot() given x and y: factor(cyl) and factor(vs)
qplot(x = factor(cyl), y = factor(vs), data = mtcars)

# Same x and y, with the geom set to "jitter" explicitly
qplot(x = factor(cyl), y = factor(vs), data = mtcars, geom = "jitter")

Choosing geoms: dotplot Some naming conventions:

Scatter plots: Continuous x, continuous y. Dot plots: Categorical x, continuous y.

# cyl and am are factors, wt is numeric
class(mtcars$cyl)

## [1] "factor"

class(mtcars$am)

## [1] "factor"

class(mtcars$wt)

## [1] "numeric"

# "Basic" dot plot: geom_point() jittered along x only (width 0.2, height 0)
ggplot(data = mtcars, mapping = aes(x = cyl, y = wt, colour = am)) +
  geom_point(position = position_jitter(width = 0.2, height = 0))

# 1 - "True" dot plot: geom_dotplot() binned along y and stacked around the centre
ggplot(data = mtcars, mapping = aes(x = cyl, y = wt, fill = am)) +
  geom_dotplot(binaxis = "y", stackdir = "center")

## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

# 2 - The same dot plot through qplot(), passing the dotplot arguments along
qplot(x = cyl, y = wt, data = mtcars, fill = am,
      geom = "dotplot", binaxis = "y", stackdir = "center")

## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

#Chicken weight

The ChickWeight dataset is a data frame which represents the progression of weight of several chicks. The little chicklings are each given a specific diet. There are four types of diet and the farmer wants to know which one fattens the chicks the fastest.

# ChickWeight is available in your workspace
# 1 - Check out the head of ChickWeight
head(ChickWeight)

## Grouped Data: weight ~ Time | Chick
##   weight Time Chick Diet
## 1     42    0     1    1
## 2     51    2     1    1
## 3     59    4     1    1
## 4     64    6     1    1
## 5     76    8     1    1
## 6     93   10     1    1

# 2 - Basic line plot: one line per chick
ggplot(ChickWeight, aes(x = Time, y = weight)) +
  geom_line(aes(group = Chick))

# 3 - Take plot 2, map Diet onto col.
ggplot(ChickWeight, aes(x = Time, y = weight, color = Diet)) +
  geom_line(aes(group = Chick))

# 4 - Take plot 3, add geom_smooth().
# alpha is set outside aes() so the per-chick lines are drawn at a fixed 0.3
# transparency; inside aes() the constant would be mapped through the alpha
# scale and would add an alpha legend to the plot.
ggplot(ChickWeight, aes(x = Time, y = weight, color = Diet)) +
  geom_line(aes(group = Chick), alpha = 0.3) +
  geom_smooth(lwd = 2, se = FALSE)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Titanic

You’ve watched the movie Titanic by James Cameron (1997) again and after a good portion of sobbing you decide to investigate whether you’d have a chance of surviving this disaster.

library(titanic)

## Warning: package 'titanic' was built under R version 3.4.4

knitr::kable(head(titanic_train))

PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
1	0	3	Braund, Mr. Owen Harris	male	22	1	A/5 21171	7.2500		S
2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Thayer)	female	38	1	PC 17599	71.2833	C85	C
3	1	3	Heikkinen, Miss. Laina	female	26	0	STON/O2. 3101282	7.9250		S
4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35	1	113803	53.1000	C123	S
5	0	3	Allen, Mr. William Henry	male	35	0	373450	8.0500		S
6	0	3	Moran, Mr. James	male	NA	0	330877	8.4583		Q

str(titanic_train)

## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...

# 1 - Copy the training set to titanic and check its structure
titanic <- titanic_train
str(object = titanic)

## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...

# 2 - Dodged bar plot of Pclass, filled by Sex
ggplot(data = titanic, mapping = aes(x = Pclass, fill = Sex)) +
  geom_bar(position = "dodge")

# 3 - Same bar plot, split into one panel per value of Survived
ggplot(data = titanic, mapping = aes(x = Pclass, fill = Sex)) +
  geom_bar(position = "dodge") +
  facet_grid(. ~ Survived)

# 4 - Position object combining jitter and dodge, used by the next plot
posn.jd <- position_jitterdodge(
  jitter.width = 0.5,
  jitter.height = 0,
  dodge.width = 0.6
)

# 5 - Age against Pclass as points coloured by Sex, placed with posn.jd and
# split into one panel per value of Survived
ggplot(data = titanic, mapping = aes(x = Pclass, y = Age, colour = Sex)) +
  geom_point(position = posn.jd, size = 3, alpha = 0.5) +
  facet_grid(. ~ Survived)

## Warning: Removed 177 rows containing missing values (geom_point).

Data Visualization with ggplot2

Manuela da Cruz Chadreque