Introduction

In this report I will use ggplot2 to visualize different aspects of the diamonds and mpg datasets.

Diamonds Dataset


> # Attach the tidyverse and load the diamonds dataset into the workspace
> library("tidyverse")
> data("diamonds")

The dataset has 53940 observations and 10 columns.

Sample

> # Display the first 6 rows as a styled table
> library(knitr)
> library(kableExtra)
> kable(head(diamonds)) %>%
+   kable_styling(
+     bootstrap_options = c("striped", "condensed"),
+     # spell out FALSE: `F` is an ordinary variable that can be reassigned
+     full_width = FALSE,
+     position = "left"
+   ) %>%
+   row_spec(0, background = "orchid")
carat cut color clarity depth table price x y z
0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48

Description

Column Description
carat weight of the diamond
cut quality of the cut (Fair, Good, Very Good, Premium, Ideal)
color from D (best) to J (worst)
clarity from I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best)
depth depth percentage = z/mean(x,y)
table width of top of diamond relative to widest point
price price in US dollars
x length in mm
y width in mm
z depth in mm

MPG Dataset


> # Load the mpg dataset into the workspace
> data("mpg")

The dataset has 234 observations and 11 columns.

Sample

> # Display the first 6 rows as a styled table
> library(knitr)
> library(kableExtra)
> kable(head(mpg)) %>%
+   kable_styling(
+     bootstrap_options = c("striped", "condensed"),
+     # spell out FALSE: `F` is an ordinary variable that can be reassigned
+     full_width = FALSE,
+     position = "left"
+   ) %>%
+   row_spec(0, background = "orchid")
manufacturer model displ year cyl trans drv cty hwy fl class
audi a4 1.8 1999 4 auto(l5) f 18 29 p compact
audi a4 1.8 1999 4 manual(m5) f 21 29 p compact
audi a4 2.0 2008 4 manual(m6) f 20 31 p compact
audi a4 2.0 2008 4 auto(av) f 21 30 p compact
audi a4 2.8 1999 6 auto(l5) f 16 26 p compact
audi a4 2.8 1999 6 manual(m5) f 18 26 p compact

Description

Column Description
manufacturer manufacturer name
model model name
displ engine displacement, in liters
year year of manufacture
cyl number of cylinders
trans type of transmission
drv f= front-wheel drive, r= rear-wheel drive, 4 = 4wd
cty city miles per gallon
hwy highway miles per gallon
fl fuel type
class “type” of car

Visualizations


Scatterplots

> library(gridExtra)
> # Basic scatterplot of displacement vs. highway mileage (left)
> p1 <- ggplot(data = mpg) +
+   geom_point(mapping = aes(x = displ, y = hwy)) +
+   theme_classic()
> 
> # Same scatterplot with points colored by class (right)
> p2 <- ggplot(data = mpg) +
+   geom_point(mapping = aes(x = displ, y = hwy, color = class)) +
+   theme_classic()
> 
> # Show both plots side by side
> grid.arrange(p1, p2, ncol = 2)

The scatterplots show a negative relationship between engine size (displ) and highway fuel efficiency (hwy). However, there is a group of points that falls outside the trend (large engine and average fuel efficiency). With a basic scatterplot we cannot determine why. When we color the points by class, we can see that the unusual points are 2-seater cars (most likely lightweight sports cars).

> # Scatterplot with only 126 visible points because of overplotting (left)
> p1 <- ggplot(data = mpg) +
+   geom_point(mapping = aes(x = displ, y = hwy, color = class)) +
+   theme_classic() +
+   theme(legend.position = "none")
> 
> # Jittered scatterplot showing all 234 points (right)
> p2 <- ggplot(data = mpg) +
+   geom_jitter(mapping = aes(x = displ, y = hwy, color = class)) +
+   theme_classic() +
+   theme(legend.position = "none")
> 
> grid.arrange(p1, p2, ncol = 2)

Unfortunately, the basic scatterplot (left) shows only 126 distinct points even though the dataset has 234 observations: the values are rounded, so many points overlap exactly. In many cases this is fine, but to reveal every observation a “jitter” plot (right) can be used. This adds a small amount of random noise to each point so that overlapping points are spread apart.

> # Points by alpha (left)
> # NOTE: ggplot2 warns that alpha is not advised for a discrete variable
> p1 <- ggplot(data = mpg) +
+   geom_point(mapping = aes(x = displ, y = hwy, alpha = class)) +
+   theme_bw()
> 
> # Points by shape (right)
> # The default shape palette has only 6 shapes, but class has 7 values, so
> # one class would be dropped from the plot (with a warning). Supplying one
> # shape per class keeps every observation visible.
> p2 <- ggplot(data = mpg) +
+   geom_point(mapping = aes(x = displ, y = hwy, shape = class)) +
+   scale_shape_manual(values = seq_along(unique(mpg$class))) +
+   theme_bw()
> 
> grid.arrange(p1, p2, ncol = 2)

It is also possible to vary the points by alpha or shape.

> # Map cut to both the color and the shape of each point
> ggplot(
+   data = diamonds,
+   mapping = aes(x = carat, y = price, color = cut, shape = cut)
+ ) +
+   geom_point() +
+   scale_colour_brewer(palette = "Spectral")

You can also vary by color and shape on the same plot.

> # Smooth lines by drivetrain with standard errors (left)
> p1 <- ggplot(data = mpg) +
+   geom_smooth(mapping = aes(x = displ, y = hwy, linetype = drv))
> 
> # Smooth lines by drivetrain with points. No standard errors (right)
> p2 <- ggplot(
+   data = mpg,
+   mapping = aes(x = displ, y = hwy, linetype = drv)
+ ) +
+   geom_point(mapping = aes(color = drv)) +
+   geom_smooth(se = FALSE)
> 
> grid.arrange(p1, p2, ncol = 2)

Sometimes adding a smooth line helps to visualize the trend.

> # Smooth line fitted to all data (left)
> p1 <- ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
+   geom_point(mapping = aes(color = class)) +
+   geom_smooth(se = FALSE) +
+   theme_classic()
> 
> # Smooth line fitted to subcompact cars only (right)
> # The layer-level `data` overrides the plot data for the smoother only,
> # so the points still show every class.
> p2 <- ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
+   geom_point(mapping = aes(color = class)) +
+   geom_smooth(
+     data = filter(mpg, class == "subcompact"),
+     se = FALSE
+   ) +
+   theme_classic()
> 
> grid.arrange(p1, p2, ncol = 2)

You can also add a smooth line with filtered data.

Facets

It can also be useful to split the plot into facets

> # One subplot per class, wrapped onto two rows
> ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = class)) +
+   geom_point() +
+   facet_wrap(~class, nrow = 2) +
+   theme(legend.position = "none")

> # Grid of subplots: drivetrain type in rows, number of cylinders in columns
> ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = class)) +
+   geom_point() +
+   facet_grid(drv ~ cyl)

> # One column of subplots per number of cylinders
> ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = class)) +
+   geom_point() +
+   facet_grid(. ~ cyl)

> # Diamonds faceted by color (one row each), with points colored by cut
> ggplot(data = diamonds, mapping = aes(x = carat, y = price, color = cut)) +
+   geom_point() +
+   facet_grid(color ~ .) +
+   scale_colour_brewer(palette = "YlGnBu")

> # Diamonds faceted by color (rows) and cut (columns), colored by clarity
> ggplot(
+   data = diamonds,
+   mapping = aes(x = carat, y = price, color = clarity)
+ ) +
+   geom_point() +
+   facet_grid(color ~ cut) +
+   scale_colour_brewer(palette = "YlGnBu")

Bar Charts

> # Basic bar chart by cut and count (left)
> p1 <- ggplot(data = diamonds) +
+   geom_bar(mapping = aes(x = cut), fill = "darkorange")
> 
> # Basic bar chart by cut and proportion (right)
> # after_stat(prop) replaces the deprecated `..prop..` notation.
> # group = 1 makes each proportion relative to the whole dataset
> # rather than to its own bar.
> p2 <- ggplot(data = diamonds) +
+   geom_bar(
+     mapping = aes(x = cut, y = after_stat(prop), group = 1),
+     fill = "darkorange"
+   )
> 
> grid.arrange(p1, p2, ncol = 2)

It is easy to visualize data by count or proportion.

> # Number of diamonds for each cut, shown as a styled table
> p3 <- count(diamonds, cut)
> 
> kable(p3) %>%
+   kable_styling(
+     bootstrap_options = c("striped", "condensed"),
+     full_width = FALSE,
+     position = "left"
+   ) %>%
+   row_spec(0, background = "orchid")
cut n
Fair 1610
Good 4906
Very Good 12082
Premium 13791
Ideal 21551

Here we can see the count values that are displayed

> # Fill by cut, the same variable as the x axis (left)
> p1 <- ggplot(data = diamonds) +
+   geom_bar(mapping = aes(x = cut, fill = cut)) +
+   scale_fill_brewer(palette = "Spectral")
> 
> # Fill by clarity, which stacks the clarity groups within each bar (right)
> p2 <- ggplot(data = diamonds) +
+   geom_bar(mapping = aes(x = cut, fill = clarity)) +
+   scale_fill_brewer(palette = "Spectral")
> 
> grid.arrange(p1, p2, ncol = 2)

When coloring by something other than the x variable the colored bars will be stacked.

> # "fill" makes each stacked bar the same height (left)
> p1 <- ggplot(data = diamonds) +
+   geom_bar(mapping = aes(x = cut, fill = clarity), position = "fill") +
+   scale_fill_brewer(palette = "Paired")
> 
> # "dodge" places overlapping objects beside one another (right)
> p2 <- ggplot(data = diamonds) +
+   geom_bar(mapping = aes(x = cut, fill = clarity), position = "dodge") +
+   scale_fill_brewer(palette = "Paired")
> 
> grid.arrange(p1, p2, ncol = 2)

Changing the position argument makes it easier to compare proportions across groups (fill) or easier to compare individual values (dodge)

Boxplots

> # Boxplot of hwy for each class (top left)
> p1 <- ggplot(data = mpg, mapping = aes(x = class, y = hwy, fill = class)) +
+   geom_boxplot() +
+   theme_bw()
> 
> # Same boxplot with the x and y axes flipped (top right)
> p2 <- ggplot(data = mpg, mapping = aes(x = class, y = hwy, fill = class)) +
+   geom_boxplot() +
+   coord_flip() +
+   theme_bw()
> 
> # Classes ordered by their median hwy (bottom left)
> p3 <- ggplot(data = mpg) +
+   geom_boxplot(
+     mapping = aes(
+       x = reorder(class, hwy, FUN = median),
+       y = hwy,
+       fill = class
+     )
+   ) +
+   theme_bw()
> 
> # Classes ordered by median hwy, with the axes flipped (bottom right)
> p4 <- ggplot(data = mpg) +
+   geom_boxplot(
+     mapping = aes(
+       x = reorder(class, hwy, FUN = median),
+       y = hwy,
+       fill = class
+     )
+   ) +
+   theme_bw() +
+   coord_flip()
> 
> 
> grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)

Flipping the x and y axes is useful if there are long labels or if you want a horizontal boxplot. Trends are also easier to see when the data is sorted.

> # Bin carat into intervals of width 0.1 (left)
> p1 <- ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
+   geom_boxplot(
+     mapping = aes(group = cut_width(carat, 0.1)),
+     fill = "peachpuff"
+   )
> 
> # Bin carat into 20 bins holding about the same number of points (right)
> p2 <- ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
+   geom_boxplot(
+     mapping = aes(group = cut_number(carat, 20)),
+     fill = "peachpuff"
+   )
> 
> grid.arrange(p1, p2, ncol = 2)

It is possible to display a continuous variable with boxplots by converting it to bins, either of equal width or of equal count.

Histograms

> # Histogram of carat size with binwidth 0.5 (left)
> p1 <- ggplot(data = diamonds) +
+   geom_histogram(
+     mapping = aes(x = carat),
+     binwidth = 0.5,
+     fill = "lightblue"
+   ) +
+   theme_dark()
> 
> # Keep only diamonds with a carat size less than 3
> smaller <- diamonds %>% filter(carat < 3)
> 
> # Histogram of the smaller dataset with binwidth 0.1 (right)
> p2 <- ggplot(data = smaller) +
+   geom_histogram(
+     mapping = aes(x = carat),
+     binwidth = 0.1,
+     fill = "lightblue"
+   ) +
+   theme_dark()
> 
> grid.arrange(p1, p2, ncol = 2)

In this case it is easier to visualize with a smaller binwidth. The plot has also been filtered for carat size less than 3.

> # Number of diamonds in each carat bin of width 0.5, as a styled table
> p4 <- count(diamonds, cut_width(carat, 0.5))
> 
> kable(p4) %>%
+   kable_styling(
+     bootstrap_options = c("striped", "condensed"),
+     full_width = FALSE,
+     position = "left"
+   ) %>%
+   row_spec(0, background = "orchid")
cut_width(carat, 0.5) n
[-0.25,0.25] 785
(0.25,0.75] 29498
(0.75,1.25] 15977
(1.25,1.75] 5313
(1.75,2.25] 2002
(2.25,2.75] 322
(2.75,3.25] 32
(3.25,3.75] 5
(3.75,4.25] 4
(4.25,4.75] 1
(4.75,5.25] 1

The table shows the count for each bin, using 0.5 as the binwidth.

> # Histogram of the filtered data with binwidth 0.01
> ggplot(data = smaller, mapping = aes(x = carat)) +
+   geom_histogram(binwidth = 0.01, fill = "lightblue") +
+   theme_dark()

With an even smaller binwidth we can better see the clusters at the popular sizes (especially 0.25, 0.50, 1.0, 1.5, and 2.0).

> # Histogram of y, the width in mm (left)
> p1 <- ggplot(diamonds) +
+   geom_histogram(mapping = aes(x = y), color = "gold", binwidth = 0.5)
> 
> # Same histogram zoomed in to counts between 0 and 50 (right)
> # coord_cartesian() changes only the visible range; no data is dropped.
> p2 <- ggplot(diamonds) +
+   geom_histogram(mapping = aes(x = y), color = "gold", binwidth = 0.5) +
+   coord_cartesian(ylim = c(0, 50))
> 
> grid.arrange(p1, p2, ncol = 2)

If we plot the y variable (width in mm) we can see that the x-axis is unusually wide, signifying the presence of outliers. However, they are difficult to see. They become more evident when the y-axis limits are reduced.

Outliers

> # Rows whose width (y) is below 3 mm or above 20 mm, sorted by y
> outliers <- diamonds %>%
+   filter(y < 3 | y > 20) %>%
+   arrange(y)
> 
> kable(outliers) %>%
+   kable_styling(
+     bootstrap_options = c("striped", "condensed"),
+     full_width = FALSE,
+     position = "left"
+   ) %>%
+   row_spec(0, background = "orchid")
carat cut color clarity depth table price x y z
1.00 Very Good H VS2 63.3 53 5139 0.00 0.0 0.00
1.14 Fair G VS1 57.5 67 6381 0.00 0.0 0.00
1.56 Ideal G VS2 62.2 54 12800 0.00 0.0 0.00
1.20 Premium D VVS1 62.1 59 15686 0.00 0.0 0.00
2.25 Premium H SI2 62.8 59 18034 0.00 0.0 0.00
0.71 Good F SI2 64.1 60 2130 0.00 0.0 0.00
0.71 Good F SI2 64.1 60 2130 0.00 0.0 0.00
0.51 Ideal E VS1 61.8 55 2075 5.15 31.8 5.12
2.00 Premium H SI2 58.9 57 12210 8.09 58.9 8.06

There are several diamonds with a width of 0 (impossible) and two with extremely large widths (not likely given the low prices). These are incorrect measurements and should be removed.

> # Scatterplot of x vs. y with outliers included (left)
> p1 <- ggplot(data = diamonds, mapping = aes(x = x, y = y)) +
+   geom_point()
> 
> # Set incorrect y values to NA
> diamonds2 <- diamonds %>%
+   mutate(y = ifelse(y < 3 | y > 20, NA, y))
> 
> # Set incorrect x values to NA
> diamonds3 <- diamonds2 %>%
+   mutate(x = ifelse(x < 3 | x > 20, NA, x))
> 
> # Plot with NA values removed (right)
> # na.rm = TRUE drops the NA rows without a "removed rows" warning
> p2 <- ggplot(data = diamonds3, mapping = aes(x = x, y = y)) +
+   geom_point(na.rm = TRUE)
> 
> grid.arrange(p1, p2, ncol = 2)

There is a dramatic difference with the outliers removed.

Frequency Polygon

> # Frequency polygon of carat, with one line for each cut
> ggplot(data = smaller) +
+   geom_freqpoly(mapping = aes(x = carat, color = cut), binwidth = 0.1) +
+   theme_bw()

geom_freqpoly is useful for overlaying multiple histograms. It displays the counts with lines instead of bars.

> # Frequency polygon of price by count (left)
> p1 <- ggplot(data = diamonds, mapping = aes(x = price)) +
+   geom_freqpoly(mapping = aes(color = cut), binwidth = 500) +
+   theme_classic()
> 
> # Frequency polygon of price by density (right)
> # after_stat(density) replaces the deprecated `..density..` notation
> p2 <- ggplot(
+   data = diamonds,
+   mapping = aes(x = price, y = after_stat(density))
+ ) +
+   geom_freqpoly(mapping = aes(color = cut), binwidth = 500) +
+   theme_classic()
> 
> grid.arrange(p1, p2, ncol = 2)

If the groups have very different sizes it is difficult to make inferences based on count. Instead we can use density, which is the count standardized so that the area under each frequency polygon is one.

Density

> # Density curve of price, with one line for each cut
> ggplot(data = diamonds, mapping = aes(x = price, color = cut)) +
+   geom_density() +
+   theme_classic()

geom_density is a good alternative to a frequency polygon, and the plot lines are smoother

Heatmap

> # Heatmap of the number of diamonds for each color/cut combination
> count(diamonds, color, cut) %>%
+   ggplot(mapping = aes(x = color, y = cut, fill = n)) +
+   geom_tile()

Here we can visualize the count by color and cut.

Bin2d and Hex

> # Rectangular 2D bins with fill color to display count (left)
> p1 <- ggplot(data = smaller) +
+   geom_bin2d(mapping = aes(x = carat, y = price))
> 
> # Hexagonal bins with fill color to display count (right)
> library("hexbin")
> p2 <- ggplot(data = smaller) +
+   geom_hex(mapping = aes(x = carat, y = price))
> 
> grid.arrange(p1, p2, ncol = 2)

geom_bin2d creates 2D rectangular bins and colors them by count and geom_hex uses hexagonal bins.

Pairs

> # Pairwise plots and correlations for columns 5 to 10 of diamonds
> library(GGally)
> ggpairs(diamonds[5:10])

Stat Summary

> # Visualize summary statistics: the median depth of each cut as a point,
> # with a line running from the minimum to the maximum
> ggplot(data = diamonds) +
+   theme_light() +
+   stat_summary(
+     mapping = aes(x = cut, y = depth),
+     # fun / fun.min / fun.max replace fun.y / fun.ymin / fun.ymax,
+     # which were deprecated in ggplot2 3.3.0
+     fun = median,
+     fun.min = min,
+     fun.max = max
+   )

It is also possible to visualize summary statistics, such as the min, max, and median.

Plot Options

> # Frequently used options for customizing a plot
> ggplot(data = diamonds, mapping = aes(x = cut, fill = cut)) +
+   geom_bar() +
+   scale_fill_brewer(palette = "Spectral") + # choose the color palette
+   theme_light() + # switch the overall theme
+   theme(legend.position = "none") + # hide the legend
+   theme(text = element_text(size = 15)) + # enlarge all text
+   ggtitle("Quality of cut") + # set the plot title
+   theme(plot.title = element_text(hjust = 0.5)) + # center the title
+   xlab("Diamond Cut") + # relabel the x axis
+   ylab("Diamond Count") # relabel the y axis

Transformations

> # Carat, price, and cut on the original scale (left)
> p1 <- ggplot(data = diamonds) +
+   geom_point(mapping = aes(x = carat, y = price, color = cut)) +
+   scale_colour_brewer(palette = "Spectral") +
+   facet_grid(. ~ cut)
> 
> # Same plot with log-transformed coordinates (right)
> p2 <- ggplot(data = diamonds) +
+   geom_point(mapping = aes(x = carat, y = price, color = cut)) +
+   scale_colour_brewer(palette = "Spectral") +
+   facet_grid(. ~ cut) +
+   coord_trans(x = "log", y = "log")
> 
> grid.arrange(p1, p2, ncol = 2)

> # Carat and price with a fitted linear model line (left)
> p1 <- ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
+   geom_point() +
+   geom_smooth(method = "lm")
> 
> # Both variables log-transformed before plotting and fitting (right)
> p2 <- ggplot(
+   data = diamonds,
+   mapping = aes(x = log(carat), y = log(price))
+ ) +
+   geom_point() +
+   geom_smooth(method = "lm")
> 
> grid.arrange(p1, p2, ncol = 2)