Introduction

In this report I will use ggplot2 to visualize different aspects of the diamonds and mpg datasets.

Diamonds Dataset

> # Attach the tidyverse packages (ggplot2 and the %>% pipe are used throughout)
> library(tidyverse)
> # Load the diamonds dataset into the workspace
> data(diamonds)

The dataset has 53940 observations and 10 columns.

Sample

> # Show the first 6 rows of diamonds as a styled table
> library(knitr)
> library(kableExtra)
> kable(head(diamonds)) %>%
+   kable_styling(
+     bootstrap_options = c("striped", "condensed"),
+     # FALSE rather than F: F is an ordinary variable that can be reassigned
+     full_width = FALSE,
+     position = "left"
+   ) %>%
+   # Row 0 addresses the header row
+   row_spec(0, background = "orchid")

carat	cut	color	clarity	depth	table	price	x	y	z
0.23	Ideal	E	SI2	61.5	55	326	3.95	3.98	2.43
0.21	Premium	E	SI1	59.8	61	326	3.89	3.84	2.31
0.23	Good	E	VS1	56.9	65	327	4.05	4.07	2.31
0.29	Premium	I	VS2	62.4	58	334	4.20	4.23	2.63
0.31	Good	J	SI2	63.3	58	335	4.34	4.35	2.75
0.24	Very Good	J	VVS2	62.8	57	336	3.94	3.96	2.48

Description

Column	Description
carat	weight of the diamond
cut	quality of the cut (Fair, Good, Very Good, Premium, Ideal)
color	from D (best) to J (worst)
clarity	from I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best)
depth	depth percentage = z/mean(x,y)
table	width of top of diamond relative to widest point
price	price in US dollars
x	length in mm
y	width in mm
z	depth in mm

MPG Dataset

> # Load the mpg dataset into the workspace
> data(mpg)

The dataset has 234 observations and 11 columns.

Sample

> # Show the first 6 rows of mpg as a styled table
> library(knitr)
> library(kableExtra)
> kable(head(mpg)) %>%
+   kable_styling(
+     bootstrap_options = c("striped", "condensed"),
+     # FALSE rather than F: F is an ordinary variable that can be reassigned
+     full_width = FALSE,
+     position = "left"
+   ) %>%
+   # Row 0 addresses the header row
+   row_spec(0, background = "orchid")

manufacturer	model	displ	year	cyl	trans	drv	cty	hwy	fl	class
audi	a4	1.8	1999	4	auto(l5)	f	18	29	p	compact
audi	a4	1.8	1999	4	manual(m5)	f	21	29	p	compact
audi	a4	2.0	2008	4	manual(m6)	f	20	31	p	compact
audi	a4	2.0	2008	4	auto(av)	f	21	30	p	compact
audi	a4	2.8	1999	6	auto(l5)	f	16	26	p	compact
audi	a4	2.8	1999	6	manual(m5)	f	18	26	p	compact

Description

Column	Description
manufacturer	manufacturer name
model	model name
displ	engine displacement, in liters
year	year of manufacture
cyl	number of cylinders
trans	type of transmission
drv	f= front-wheel drive, r= rear-wheel drive, 4 = 4wd
cty	city miles per gallon
hwy	highway miles per gallon
fl	fuel type
class	“type” of car

Visualizations

Scatterplots

> library(gridExtra)
> # Left panel: plain scatterplot of highway mpg against engine displacement
> p1 <- ggplot(data = mpg) +
+   geom_point(mapping = aes(x = displ, y = hwy)) +
+   theme_classic()
> 
> # Right panel: the same scatterplot with points colored by class
> p2 <- ggplot(data = mpg) +
+   geom_point(mapping = aes(x = displ, y = hwy, color = class)) +
+   theme_classic()
> 
> # Draw the two panels side by side
> grid.arrange(p1, p2, ncol = 2)

The scatterplots show a negative relationship between engine size (displ) and highway fuel efficiency (hwy). However, there is a group of points that falls outside the trend (large engine and average fuel efficiency). With a basic scatterplot we cannot determine why. When we color the points by class we can see that the unusual points are 2-seater cars (most likely lightweight sports cars).

> # Left panel: geom_point, where overlapping points leave only 126 visible
> p1 <- ggplot(data = mpg) +
+   geom_point(mapping = aes(x = displ, y = hwy, color = class)) +
+   theme_classic() +
+   theme(legend.position = "none")
> 
> # Right panel: geom_jitter, so that all 234 points are visible
> p2 <- ggplot(data = mpg) +
+   geom_jitter(mapping = aes(x = displ, y = hwy, color = class)) +
+   theme_classic() +
+   theme(legend.position = "none")
> 
> grid.arrange(p1, p2, ncol = 2)

Unfortunately the scatterplots show only 126 points even though the dataset has 234. The values are rounded and overlap. In many cases this is fine, but to plot everything a “jitter” plot can be used. This adds a small amount of random noise to each point so that none overlap.

> # Left panel: class mapped to point transparency (alpha)
> p1 <- ggplot(data = mpg) +
+   geom_point(mapping = aes(x = displ, y = hwy, alpha = class)) +
+   theme_bw()
> 
> # Right panel: class mapped to point shape
> p2 <- ggplot(data = mpg) +
+   geom_point(mapping = aes(x = displ, y = hwy, shape = class)) +
+   theme_bw()
> 
> grid.arrange(p1, p2, ncol = 2)

It is also possible to vary the points by alpha or shape.

> # Map cut to both the color and the shape of each point
> ggplot(data = diamonds) +
+   geom_point(
+     mapping = aes(x = carat, y = price, color = cut, shape = cut)
+   ) +
+   scale_colour_brewer(palette = "Spectral")

You can also vary by color and shape on the same plot.

> # Left panel: one smooth line per drivetrain, with standard errors
> p1 <- ggplot(data = mpg) +
+   geom_smooth(mapping = aes(x = displ, y = hwy, linetype = drv))
> 
> # Right panel: smooth lines per drivetrain drawn over the points,
> # without standard errors
> p2 <- ggplot(
+   data = mpg,
+   mapping = aes(x = displ, y = hwy, linetype = drv)
+ ) +
+   geom_point(mapping = aes(color = drv)) +
+   geom_smooth(se = FALSE)
> 
> grid.arrange(p1, p2, ncol = 2)

Sometimes adding a smooth line helps to visualize the trend.

> # Left panel: points colored by class, smooth line fitted to all rows
> p1 <- ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
+   geom_point(mapping = aes(color = class)) +
+   geom_smooth(se = FALSE) +
+   theme_classic()
> 
> # Right panel: same points, smooth line fitted to subcompact cars only
> p2 <- ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
+   geom_point(mapping = aes(color = class)) +
+   geom_smooth(
+     data = filter(mpg, class == "subcompact"),
+     se = FALSE
+   ) +
+   theme_classic()
> 
> grid.arrange(p1, p2, ncol = 2)

You can also add a smooth line with filtered data.

Facets

It can also be useful to split the plot into facets

> # One subplot per class, wrapped onto two rows
> ggplot(data = mpg) +
+   geom_point(mapping = aes(x = displ, y = hwy, color = class)) +
+   facet_wrap(~class, nrow = 2) +
+   theme(legend.position = "none")

> # Grid of subplots: drivetrain in rows, number of cylinders in columns
> ggplot(data = mpg) +
+   geom_point(mapping = aes(x = displ, y = hwy, color = class)) +
+   facet_grid(drv ~ cyl)

> # One column of subplots per number of cylinders
> ggplot(data = mpg) +
+   geom_point(mapping = aes(x = displ, y = hwy, color = class)) +
+   facet_grid(. ~ cyl)

> # Diamonds faceted by color (rows), points colored by cut
> ggplot(data = diamonds) +
+   geom_point(mapping = aes(x = carat, y = price, color = cut)) +
+   facet_grid(color ~ .) +
+   scale_colour_brewer(palette = "YlGnBu")

> # Diamonds faceted by color (rows) and cut (columns), colored by clarity
> ggplot(data = diamonds) +
+   geom_point(mapping = aes(x = carat, y = price, color = clarity)) +
+   facet_grid(color ~ cut) +
+   scale_colour_brewer(palette = "YlGnBu")

Bar Charts

> # Left panel: bar chart of the number of diamonds per cut
> p1 <- ggplot(data = diamonds) +
+   geom_bar(mapping = aes(x = cut), fill = "darkorange")
> 
> # Right panel: bar chart of the proportion of diamonds per cut.
> # after_stat(prop) replaces the deprecated ..prop.. notation; group = 1
> # makes the proportions relative to the whole dataset rather than to
> # each bar on its own.
> p2 <- ggplot(data = diamonds) +
+   geom_bar(
+     mapping = aes(x = cut, y = after_stat(prop), group = 1),
+     fill = "darkorange"
+   )
> 
> grid.arrange(p1, p2, ncol = 2)

It is easy to visualize data by count or proportion.

> # Count the diamonds in each cut and render the result as a styled table
> p3 <- count(diamonds, cut)
> 
> p3 %>%
+   kable() %>%
+   kable_styling(
+     bootstrap_options = c("striped", "condensed"),
+     full_width = FALSE,
+     position = "left"
+   ) %>%
+   row_spec(0, background = "orchid")

cut	n
Fair	1610
Good	4906
Very Good	12082
Premium	13791
Ideal	21551

Here we can see the count values that are displayed

> # Left panel: bars filled by cut (the same variable as the x axis)
> p1 <- ggplot(data = diamonds) +
+   geom_bar(mapping = aes(x = cut, fill = cut)) +
+   scale_fill_brewer(palette = "Spectral")
> 
> # Right panel: bars filled by clarity
> p2 <- ggplot(data = diamonds) +
+   geom_bar(mapping = aes(x = cut, fill = clarity)) +
+   scale_fill_brewer(palette = "Spectral")
> 
> grid.arrange(p1, p2, ncol = 2)

When coloring by something other than the x variable the colored bars will be stacked.

> # Left panel: position "fill" makes each stacked bar the same height
> p1 <- ggplot(data = diamonds) +
+   geom_bar(mapping = aes(x = cut, fill = clarity), position = "fill") +
+   scale_fill_brewer(palette = "Paired")
> 
> # Right panel: position "dodge" places overlapping objects beside one
> # another
> p2 <- ggplot(data = diamonds) +
+   geom_bar(mapping = aes(x = cut, fill = clarity), position = "dodge") +
+   scale_fill_brewer(palette = "Paired")
> 
> grid.arrange(p1, p2, ncol = 2)

Changing the position argument makes it easier to compare proportions across groups (fill) or easier to compare individual values (dodge)

Boxplots

> # Panel 1: boxplot of hwy for each class
> p1 <- ggplot(data = mpg, mapping = aes(x = class, y = hwy, fill = class)) +
+   geom_boxplot() +
+   theme_bw()
> 
> # Panel 2: the same boxplot with the x and y axes flipped
> p2 <- ggplot(data = mpg, mapping = aes(x = class, y = hwy, fill = class)) +
+   geom_boxplot() +
+   coord_flip() +
+   theme_bw()
> 
> # Panel 3: classes ordered by their median hwy
> p3 <- ggplot(data = mpg) +
+   geom_boxplot(
+     mapping = aes(
+       x = reorder(class, hwy, FUN = median),
+       y = hwy,
+       fill = class
+     )
+   ) +
+   theme_bw()
> 
> # Panel 4: classes ordered by their median hwy, axes flipped
> p4 <- ggplot(data = mpg) +
+   geom_boxplot(
+     mapping = aes(
+       x = reorder(class, hwy, FUN = median),
+       y = hwy,
+       fill = class
+     )
+   ) +
+   theme_bw() +
+   coord_flip()
> 
> 
> grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)

Flipping the x and y axes is useful if there are long labels or if you want a horizontal boxplot. Trends are also easier to see when the data is sorted.

> # Left panel: price boxplots over carat bins of equal width (0.1)
> p1 <- ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
+   geom_boxplot(
+     mapping = aes(group = cut_width(carat, 0.1)),
+     fill = "peachpuff"
+   )
> 
> # Right panel: price boxplots over 20 carat bins holding roughly the same
> # number of points each
> p2 <- ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
+   geom_boxplot(
+     mapping = aes(group = cut_number(carat, 20)),
+     fill = "peachpuff"
+   )
> 
> grid.arrange(p1, p2, ncol = 2)

It is possible to display a continuous variable by converting it to bins, by either width or count.

Histograms

> # Left panel: histogram of carat for all diamonds, binwidth 0.5
> p1 <- ggplot(data = diamonds) +
+   geom_histogram(
+     mapping = aes(x = carat),
+     binwidth = 0.5,
+     fill = "lightblue"
+   ) +
+   theme_dark()
> 
> # Keep only diamonds with carat below 3
> smaller <- filter(diamonds, carat < 3)
> 
> # Right panel: histogram of carat for the filtered data, binwidth 0.1
> p2 <- ggplot(data = smaller) +
+   geom_histogram(
+     mapping = aes(x = carat),
+     binwidth = 0.1,
+     fill = "lightblue"
+   ) +
+   theme_dark()
> 
> grid.arrange(p1, p2, ncol = 2)

In this case it is easier to visualize with a smaller binwidth. The plot has also been filtered for carat size less than 3.

> # Count the diamonds in each 0.5-wide carat bin and show a styled table
> p4 <- count(diamonds, cut_width(carat, 0.5))
> 
> p4 %>%
+   kable() %>%
+   kable_styling(
+     bootstrap_options = c("striped", "condensed"),
+     full_width = FALSE,
+     position = "left"
+   ) %>%
+   row_spec(0, background = "orchid")

cut_width(carat, 0.5)	n
[-0.25,0.25]	785
(0.25,0.75]	29498
(0.75,1.25]	15977
(1.25,1.75]	5313
(1.75,2.25]	2002
(2.25,2.75]	322
(2.75,3.25]	32
(3.25,3.75]	5
(3.75,4.25]	4
(4.25,4.75]	1
(4.75,5.25]	1

The table shows the count for each bin, using 0.5 as the binwidth.

> # Histogram of carat for the filtered data, binwidth 0.01
> ggplot(data = smaller) +
+   geom_histogram(
+     mapping = aes(x = carat),
+     binwidth = 0.01,
+     fill = "lightblue"
+   ) +
+   theme_dark()

With an even smaller binwidth we can better see the clusters at the popular sizes (especially 0.25, 0.50, 1.0, 1.5, and 2.0).

> # Left panel: histogram of y (the width in mm)
> p1 <- ggplot(diamonds) +
+   geom_histogram(mapping = aes(x = y), color = "gold", binwidth = 0.5)
> 
> # Right panel: the same histogram with the count axis limited to 0-50
> p2 <- ggplot(diamonds) +
+   geom_histogram(mapping = aes(x = y), color = "gold", binwidth = 0.5) +
+   coord_cartesian(ylim = c(0, 50))
> 
> grid.arrange(p1, p2, ncol = 2)

If we plot the y variable (width in mm) we can see that the x-axis is unusually wide, signifying the presence of outliers. However, they are difficult to see. They become more evident when the y-axis limits are reduced.

Outliers

> # Keep rows whose width y is below 3 or above 20, ordered by y
> outliers <- diamonds %>%
+   filter(y < 3 | y > 20) %>%
+   arrange(y)
> 
> outliers %>%
+   kable() %>%
+   kable_styling(
+     bootstrap_options = c("striped", "condensed"),
+     full_width = FALSE,
+     position = "left"
+   ) %>%
+   row_spec(0, background = "orchid")

carat	cut	color	clarity	depth	table	price	x	y	z
1.00	Very Good	H	VS2	63.3	53	5139	0.00	0.0	0.00
1.14	Fair	G	VS1	57.5	67	6381	0.00	0.0	0.00
1.56	Ideal	G	VS2	62.2	54	12800	0.00	0.0	0.00
1.20	Premium	D	VVS1	62.1	59	15686	0.00	0.0	0.00
2.25	Premium	H	SI2	62.8	59	18034	0.00	0.0	0.00
0.71	Good	F	SI2	64.1	60	2130	0.00	0.0	0.00
0.71	Good	F	SI2	64.1	60	2130	0.00	0.0	0.00
0.51	Ideal	E	VS1	61.8	55	2075	5.15	31.8	5.12
2.00	Premium	H	SI2	58.9	57	12210	8.09	58.9	8.06

There are several with a width of 0 (impossible) and two with extremely large widths (not likely given the low prices). These are incorrect measurements and should be removed.

> # Left panel: x against y with the outliers still present
> p1 <- ggplot(data = diamonds, mapping = aes(x = x, y = y)) +
+   geom_point()
> 
> # Replace y values below 3 or above 20 with NA
> diamonds2 <- mutate(diamonds, y = ifelse(y < 3 | y > 20, NA, y))
> 
> # Replace x values below 3 or above 20 with NA
> diamonds3 <- mutate(diamonds2, x = ifelse(x < 3 | x > 20, NA, x))
> 
> # Right panel: x against y with the NA rows removed from the plot
> p2 <- ggplot(data = diamonds3, mapping = aes(x = x, y = y)) +
+   geom_point(na.rm = TRUE)
> 
> grid.arrange(p1, p2, ncol = 2)

There is a dramatic difference with the outliers removed.

Frequency Polygon

> # Frequency polygon of carat for the filtered data, one line per cut
> ggplot(data = smaller, mapping = aes(x = carat, color = cut)) +
+   geom_freqpoly(binwidth = 0.1) +
+   theme_bw()

geom_freqpoly is useful for overlaying multiple histograms. It displays the counts with lines instead of bars.

> # Left panel: frequency polygon of price by cut, showing counts
> p1 <- ggplot(data = diamonds, mapping = aes(x = price)) +
+   geom_freqpoly(mapping = aes(color = cut), binwidth = 500) +
+   theme_classic()
> 
> # Right panel: the same polygons showing density instead of count.
> # after_stat(density) replaces the deprecated ..density.. notation.
> p2 <- ggplot(
+   data = diamonds,
+   mapping = aes(x = price, y = after_stat(density))
+ ) +
+   geom_freqpoly(mapping = aes(color = cut), binwidth = 500) +
+   theme_classic()
> 
> grid.arrange(p1, p2, ncol = 2)

If the groups have very different sizes it is difficult to make inferences based on count. Instead we can use density, which is the count standardized so that the area under each frequency polygon is one.

Density

> # Density curve of price, one line per cut
> ggplot(data = diamonds) +
+   geom_density(mapping = aes(x = price, color = cut)) +
+   theme_classic()

geom_density is a good alternative to a frequency polygon, and the plot lines are smoother

Heatmap

> # Heatmap of the number of diamonds for each color / cut combination
> count(diamonds, color, cut) %>%
+   ggplot(mapping = aes(x = color, y = cut)) +
+   geom_tile(mapping = aes(fill = n))

Here we can visualize the count by color and cut.

Bin2d and Hex

> # Left panel: rectangular 2D bins of carat and price
> p1 <- ggplot(data = smaller) +
+   geom_bin2d(mapping = aes(x = carat, y = price))
> 
> # Right panel: hexagonal bins of carat and price
> library("hexbin")
> p2 <- ggplot(data = smaller) +
+   geom_hex(mapping = aes(x = carat, y = price))
> 
> grid.arrange(p1, p2, ncol = 2)

geom_bin2d creates 2D rectangular bins and colors them by count and geom_hex uses hexagonal bins.

Pairs

> # Pairwise plot matrix of columns 5 to 10 (depth, table, price, x, y, z);
> # a useful way to view correlations
> library(GGally)
> ggpairs(diamonds[, 5:10])

Stat Summary

> # Visualize summary statistics: median depth per cut with its min-max range
> ggplot(data = diamonds) +
+   theme_light() +
+   stat_summary(
+     mapping = aes(x = cut, y = depth),
+     # fun.min / fun.max / fun replace fun.ymin / fun.ymax / fun.y,
+     # which are deprecated in current ggplot2
+     fun.min = min,
+     fun.max = max,
+     fun = median
+   )

It is also possible to visualize summary statistics, like min, max, and median.

Plot Options

> # Common plot options
> ggplot(data = diamonds) +
+   geom_bar(mapping = aes(x = cut, fill = cut)) +
+   scale_fill_brewer(palette = "Spectral") +  # Change color set
+   theme_light() +  # Change theme
+   theme(
+     legend.position = "none",  # Remove legend
+     text = element_text(size = 15),  # Increase text size
+     plot.title = element_text(hjust = 0.5)  # Center title
+   ) +
+   labs(
+     title = "Quality of cut",  # Add title
+     x = "Diamond Cut",  # Change x-axis label
+     y = "Diamond Count"  # Change y-axis label
+   )

Transformations

> # Left panel: price against carat, faceted and colored by cut
> p1 <- ggplot(data = diamonds) +
+   geom_point(mapping = aes(x = carat, y = price, color = cut)) +
+   scale_colour_brewer(palette = "Spectral") +
+   facet_grid(. ~ cut)
> 
> # Right panel: the same plot with log-transformed x and y coordinates
> p2 <- ggplot(data = diamonds) +
+   geom_point(mapping = aes(x = carat, y = price, color = cut)) +
+   scale_colour_brewer(palette = "Spectral") +
+   facet_grid(. ~ cut) +
+   coord_trans(x = "log", y = "log")
> 
> grid.arrange(p1, p2, ncol = 2)

> # Left panel: price against carat with a linear-model fit line
> p1 <- ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
+   geom_point() +
+   geom_smooth(method = "lm")
> 
> # Right panel: log(price) against log(carat) with a linear-model fit line
> p2 <- ggplot(
+   data = diamonds,
+   mapping = aes(x = log(carat), y = log(price))
+ ) +
+   geom_point() +
+   geom_smooth(method = "lm")
> 
> grid.arrange(p1, p2, ncol = 2)

Data Visualization with ggplot2

R Markdown

Paul Jozefek

3/15/2020

Introduction

Diamonds Dataset

Sample

Description

MPG Dataset

Sample

Description

Visualizations

Scatterplots

Facets

Bar Charts

Boxplots

Histograms

Outliers

Frequency Polygon

Density

Heatmap

Bin2d and Hex

Pairs

Stat Summary

Plot Options

Transformations