In this report I will use ggplot2 to visualize different aspects of the diamonds and mpg datasets.
The diamonds dataset has 53940 observations and 10 columns.
> # Show the first 6 rows of diamonds as a styled HTML table
> library(knitr)
> library(kableExtra)
> kable(head(diamonds)) %>%
+   kable_styling(
+     bootstrap_options = c("striped", "condensed"),
+     full_width = FALSE, # spelled out: `F` is a plain variable that can be reassigned
+     position = "left"
+   ) %>%
+   row_spec(0, background = "orchid") # row 0 addresses the header row

| carat | cut | color | clarity | depth | table | price | x | y | z |
|---|---|---|---|---|---|---|---|---|---|
| 0.23 | Ideal | E | SI2 | 61.5 | 55 | 326 | 3.95 | 3.98 | 2.43 |
| 0.21 | Premium | E | SI1 | 59.8 | 61 | 326 | 3.89 | 3.84 | 2.31 |
| 0.23 | Good | E | VS1 | 56.9 | 65 | 327 | 4.05 | 4.07 | 2.31 |
| 0.29 | Premium | I | VS2 | 62.4 | 58 | 334 | 4.20 | 4.23 | 2.63 |
| 0.31 | Good | J | SI2 | 63.3 | 58 | 335 | 4.34 | 4.35 | 2.75 |
| 0.24 | Very Good | J | VVS2 | 62.8 | 57 | 336 | 3.94 | 3.96 | 2.48 |
| Column | Description |
|---|---|
| carat | weight of the diamond |
| cut | quality of the cut (Fair, Good, Very Good, Premium, Ideal) |
| color | from D (best) to J (worst) |
| clarity | from I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best) |
| depth | depth percentage = z/mean(x,y) |
| table | width of top of diamond relative to widest point |
| price | price in US dollars |
| x | length in mm |
| y | width in mm |
| z | depth in mm |
The mpg dataset has 234 observations and 11 columns.
> # Show the first 6 rows of mpg as a styled HTML table
> library(knitr)
> library(kableExtra)
> kable(head(mpg)) %>%
+   kable_styling(
+     bootstrap_options = c("striped", "condensed"),
+     full_width = FALSE, # spelled out: `F` is a plain variable that can be reassigned
+     position = "left"
+   ) %>%
+   row_spec(0, background = "orchid") # row 0 addresses the header row

| manufacturer | model | displ | year | cyl | trans | drv | cty | hwy | fl | class |
|---|---|---|---|---|---|---|---|---|---|---|
| audi | a4 | 1.8 | 1999 | 4 | auto(l5) | f | 18 | 29 | p | compact |
| audi | a4 | 1.8 | 1999 | 4 | manual(m5) | f | 21 | 29 | p | compact |
| audi | a4 | 2.0 | 2008 | 4 | manual(m6) | f | 20 | 31 | p | compact |
| audi | a4 | 2.0 | 2008 | 4 | auto(av) | f | 21 | 30 | p | compact |
| audi | a4 | 2.8 | 1999 | 6 | auto(l5) | f | 16 | 26 | p | compact |
| audi | a4 | 2.8 | 1999 | 6 | manual(m5) | f | 18 | 26 | p | compact |
| Column | Description |
|---|---|
| manufacturer | manufacturer name |
| model | model name |
| displ | engine displacement, in liters |
| year | year of manufacture |
| cyl | number of cylinders |
| trans | type of transmission |
| drv | f= front-wheel drive, r= rear-wheel drive, 4 = 4wd |
| cty | city miles per gallon |
| hwy | highway miles per gallon |
| fl | fuel type |
| class | “type” of car |
> library(gridExtra)
> # Basic scatterplot of highway mileage against engine displacement (left)
> p1 <- ggplot(data = mpg) +
+   geom_point(mapping = aes(x = displ, y = hwy)) +
+   theme_classic()
>
> # Same scatterplot with the points colored by class (right)
> p2 <- ggplot(data = mpg) +
+   geom_point(mapping = aes(x = displ, y = hwy, color = class)) +
+   theme_classic()
>
> grid.arrange(p1, p2, ncol = 2)

The scatterplots show a negative relationship between engine size (displ) and highway fuel efficiency (hwy). However, there is a group of points that falls outside the trend (large engine and average fuel efficiency). With a basic scatterplot we cannot determine why. When we color the points by class we can see that the unusual points are 2-seater cars (most likely lightweight sports cars).
> # Plain scatterplot: overlapping points leave only 126 visible (left)
> p1 <- ggplot(data = mpg) +
+   geom_point(mapping = aes(x = displ, y = hwy, color = class)) +
+   theme_classic() +
+   theme(legend.position = "none")
>
> # Jittered scatterplot: all 234 points are drawn (right)
> p2 <- ggplot(data = mpg) +
+   geom_jitter(mapping = aes(x = displ, y = hwy, color = class)) +
+   theme_classic() +
+   theme(legend.position = "none")
>
> grid.arrange(p1, p2, ncol = 2)

Unfortunately the plain scatterplot shows only 126 distinct points even though the dataset has 234 observations, because the values are rounded and many points overlap. In many cases this is fine, but to show every observation a “jitter” plot can be used. This adds a small amount of random noise to each point so that overlapping points are spread apart.
> # Points distinguished by transparency (alpha) per class (left)
> p1 <- ggplot(data = mpg) +
+   geom_point(mapping = aes(x = displ, y = hwy, alpha = class)) +
+   theme_bw()
>
> # Points distinguished by shape per class (right)
> # NOTE(review): ggplot2's default shape palette covers at most 6 discrete
> # values; confirm that every class in mpg actually receives a shape here.
> p2 <- ggplot(data = mpg) +
+   geom_point(mapping = aes(x = displ, y = hwy, shape = class)) +
+   theme_bw()
>
> grid.arrange(p1, p2, ncol = 2)It is also possible to vary the points by alpha or shape.
> # vary point by both color and shape
> ggplot(data=diamonds)+geom_point(mapping = aes(
+ x=carat, y=price, color=cut, shape=cut))+
+ scale_colour_brewer(palette="Spectral")You can also vary by color and shape on the same plot.
> # Smooth lines by drivetrain with standard errors (left)
> p1 <- ggplot(data = mpg) +
+   geom_smooth(mapping = aes(x = displ, y = hwy, linetype = drv))
>
> # Smooth lines by drivetrain with points; no standard errors (right)
> p2 <- ggplot(
+   data = mpg,
+   mapping = aes(x = displ, y = hwy, linetype = drv)
+ ) +
+   geom_point(mapping = aes(color = drv)) +
+   geom_smooth(se = FALSE)
>
> grid.arrange(p1, p2, ncol = 2)Sometimes adding a smooth line helps to visualize the trend.
> # Smooth line fitted to all of the data (left)
> p1 <- ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
+   geom_point(mapping = aes(color = class)) +
+   geom_smooth(se = FALSE) +
+   theme_classic()
>
> # Smooth line fitted to subcompact cars only; points still show all cars (right)
> p2 <- ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
+   geom_point(mapping = aes(color = class)) +
+   geom_smooth(data = filter(mpg, class == "subcompact"), se = FALSE) +
+   theme_classic()
>
> grid.arrange(p1, p2, ncol = 2)You can also add a smooth line with filtered data.
It can also be useful to split the plot into facets
> #subplots by class
> ggplot(data=mpg)+geom_point(mapping = aes(
+ x=displ, y=hwy, color=class))+
+ facet_wrap(~class, nrow=2)+
+ theme(legend.position = "none")> #by number of cylinders and drivetrain type
> ggplot(data=mpg)+geom_point(mapping = aes(
+ x=displ, y=hwy, color=class))+
+ facet_grid(drv ~ cyl)> #by cylinder
> ggplot(data=mpg)+geom_point(mapping = aes(
+ x=displ, y=hwy, color=class))+
+ facet_grid(. ~ cyl)> #diamonds faceted by color. Colored by cut.
> ggplot(data=diamonds)+geom_point(mapping = aes(
+ x=carat, y=price, color=cut))+
+ facet_grid(color ~ .) +
+ scale_colour_brewer(palette="YlGnBu")
> # diamonds faceted by color and cut. Colored by clarity.
> ggplot(data=diamonds)+geom_point(mapping = aes(
+ x=carat, y=price, color=clarity))+
+ facet_grid(color ~ cut) +
+ scale_colour_brewer(palette="YlGnBu")> # Basic bar chart by cut and count (left)
> p1 <- ggplot(data = diamonds) +
+   geom_bar(mapping = aes(x = cut), fill = "darkorange")
>
> # Basic bar chart by cut and proportion (right).
> # after_stat(prop) replaces the deprecated `..prop..` notation.
> p2 <- ggplot(data = diamonds) +
+   geom_bar(
+     mapping = aes(x = cut, y = after_stat(prop), group = 1),
+     fill = "darkorange"
+   )
>
> grid.arrange(p1, p2, ncol = 2)It is easy to visualize data by count or proportion.
> # Table with count by cut
> p3 <- diamonds %>% count(cut)
>
> kable(p3)%>%kable_styling(bootstrap_options=c(
+ "striped","condensed"), full_width=FALSE,
+ position = "left")%>%
+ row_spec(0,background="orchid")| cut | n |
|---|---|
| Fair | 1610 |
| Good | 4906 |
| Very Good | 12082 |
| Premium | 13791 |
| Ideal | 21551 |
Here we can see the count values that are displayed
> # Bars filled by cut, the same variable as the x-axis (left)
> p1 <- ggplot(data = diamonds) +
+   geom_bar(mapping = aes(x = cut, fill = cut)) +
+   scale_fill_brewer(palette = "Spectral")
>
> # Bars filled by clarity (right)
> p2 <- ggplot(data = diamonds) +
+   geom_bar(mapping = aes(x = cut, fill = clarity)) +
+   scale_fill_brewer(palette = "Spectral")
>
> grid.arrange(p1, p2, ncol = 2)When coloring by something other than the x variable the colored bars will be stacked.
> # position = "fill" makes each stacked bar the same height (left)
> p1 <- ggplot(data = diamonds) +
+   geom_bar(mapping = aes(x = cut, fill = clarity), position = "fill") +
+   scale_fill_brewer(palette = "Paired")
>
> # position = "dodge" places overlapping objects beside one another (right)
> p2 <- ggplot(data = diamonds) +
+   geom_bar(mapping = aes(x = cut, fill = clarity), position = "dodge") +
+   scale_fill_brewer(palette = "Paired")
>
> grid.arrange(p1, p2, ncol = 2)Changing the position argument makes it easier to compare proportions across groups (fill) or easier to compare individual values (dodge)
> # Boxplot of hwy by class (top left)
> p1 <- ggplot(data = mpg, mapping = aes(x = class, y = hwy, fill = class)) +
+   geom_boxplot() +
+   theme_bw()
>
> # Same boxplot with the x and y axes flipped (top right)
> p2 <- ggplot(data = mpg, mapping = aes(x = class, y = hwy, fill = class)) +
+   geom_boxplot() +
+   coord_flip() +
+   theme_bw()
>
> # Classes ordered by their median hwy (bottom left)
> p3 <- ggplot(data = mpg) +
+   geom_boxplot(
+     mapping = aes(
+       x = reorder(class, hwy, FUN = median),
+       y = hwy,
+       fill = class
+     )
+   ) +
+   theme_bw()
>
> # Classes ordered by their median hwy, with the axes flipped (bottom right)
> p4 <- ggplot(data = mpg) +
+   geom_boxplot(
+     mapping = aes(
+       x = reorder(class, hwy, FUN = median),
+       y = hwy,
+       fill = class
+     )
+   ) +
+   theme_bw() +
+   coord_flip()
>
>
> grid.arrange(p1, p2, p3, p4, ncol = 2, nrow = 2)

Flipping the x and y axes is useful if there are long labels or if you want a horizontal boxplot. Trends are also easier to see when the data is sorted.
> # Boxplots of price over carat bins of width 0.1 (left)
> p1 <- ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
+   geom_boxplot(
+     mapping = aes(group = cut_width(carat, 0.1)),
+     fill = "peachpuff"
+   )
>
> # Boxplots of price over 20 carat bins with the same number of points (right)
> p2 <- ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
+   geom_boxplot(
+     mapping = aes(group = cut_number(carat, 20)),
+     fill = "peachpuff"
+   )
>
> grid.arrange(p1, p2, ncol = 2)

It is possible to display a continuous variable by converting it into bins, by either width or count.
> # Histogram of carat size with binwidth 0.5 (left)
> p1 <- ggplot(data = diamonds) +
+   geom_histogram(
+     mapping = aes(x = carat),
+     binwidth = 0.5,
+     fill = "lightblue"
+   ) +
+   theme_dark()
>
> # Keep only diamonds with a carat size below 3
> smaller <- diamonds %>% filter(carat < 3)
>
> # Histogram of the filtered data with binwidth 0.1 (right)
> p2 <- ggplot(data = smaller) +
+   geom_histogram(
+     mapping = aes(x = carat),
+     binwidth = 0.1,
+     fill = "lightblue"
+   ) +
+   theme_dark()
>
> grid.arrange(p1, p2, ncol = 2)In this case it is easier to visualize with a smaller binwidth. The plot has also been filtered for carat size less than 3.
> # Table with count by 0.5 binwidth
> p4 <- diamonds %>% count(cut_width(carat, 0.5))
>
> kable(p4)%>%kable_styling(bootstrap_options=c("striped","condensed"),
+ full_width=FALSE, position = "left")%>%
+ row_spec(0,background="orchid")| cut_width(carat, 0.5) | n |
|---|---|
| [-0.25,0.25] | 785 |
| (0.25,0.75] | 29498 |
| (0.75,1.25] | 15977 |
| (1.25,1.75] | 5313 |
| (1.75,2.25] | 2002 |
| (2.25,2.75] | 322 |
| (2.75,3.25] | 32 |
| (3.25,3.75] | 5 |
| (3.75,4.25] | 4 |
| (4.25,4.75] | 1 |
| (4.75,5.25] | 1 |
The table shows the count for each bin, using 0.5 as the binwidth.
> # binwidth 0.01
> ggplot(data=smaller) +
+ geom_histogram(mapping = aes(x=carat),binwidth=0.01,
+ fill="lightblue")+
+ theme_dark()

With an even smaller binwidth we can better see the clusters at the popular sizes (especially 0.25, 0.50, 1.0, 1.5, and 2.0).
> # Histogram of y, the width in mm (left)
> p1 <- ggplot(diamonds) +
+   geom_histogram(mapping = aes(x = y), color = "gold", binwidth = 0.5)
>
> # Same histogram with the y-axis limited to 0-50 (right)
> p2 <- ggplot(diamonds) +
+   geom_histogram(mapping = aes(x = y), color = "gold", binwidth = 0.5) +
+   coord_cartesian(ylim = c(0, 50))
>
> grid.arrange(p1, p2, ncol = 2)If we plot the y variable (width in mm) we can see that the x-axis is unusually wide, signifying the presence of outliers. However, they are difficult to see. They become more evident when the y-axis limits are reduced.
> # Filter for outliers
> outliers <- diamonds %>% filter(y<3 | y>20) %>% arrange(y)
>
> kable(outliers)%>%kable_styling(bootstrap_options=c("striped","condensed"),
+ full_width=FALSE, position = "left")%>%
+ row_spec(0,background="orchid")| carat | cut | color | clarity | depth | table | price | x | y | z |
|---|---|---|---|---|---|---|---|---|---|
| 1.00 | Very Good | H | VS2 | 63.3 | 53 | 5139 | 0.00 | 0.0 | 0.00 |
| 1.14 | Fair | G | VS1 | 57.5 | 67 | 6381 | 0.00 | 0.0 | 0.00 |
| 1.56 | Ideal | G | VS2 | 62.2 | 54 | 12800 | 0.00 | 0.0 | 0.00 |
| 1.20 | Premium | D | VVS1 | 62.1 | 59 | 15686 | 0.00 | 0.0 | 0.00 |
| 2.25 | Premium | H | SI2 | 62.8 | 59 | 18034 | 0.00 | 0.0 | 0.00 |
| 0.71 | Good | F | SI2 | 64.1 | 60 | 2130 | 0.00 | 0.0 | 0.00 |
| 0.71 | Good | F | SI2 | 64.1 | 60 | 2130 | 0.00 | 0.0 | 0.00 |
| 0.51 | Ideal | E | VS1 | 61.8 | 55 | 2075 | 5.15 | 31.8 | 5.12 |
| 2.00 | Premium | H | SI2 | 58.9 | 57 | 12210 | 8.09 | 58.9 | 8.06 |
There are several with a width of 0 (impossible) and two with extremely large widths (not likely given the low prices). These are incorrect measurements and should be removed
> # Scatterplot of x against y, outliers included (left)
> p1 <- ggplot(data = diamonds, mapping = aes(x = x, y = y)) +
+   geom_point()
>
> # Set incorrect y values (below 3 or above 20) to NA
> diamonds2 <- diamonds %>% mutate(y = ifelse(y < 3 | y > 20, NA, y))
>
> # Set incorrect x values (below 3 or above 20) to NA
> diamonds3 <- diamonds2 %>% mutate(x = ifelse(x < 3 | x > 20, NA, x))
>
> # Scatterplot with the NA values removed (right)
> p2 <- ggplot(data = diamonds3, mapping = aes(x = x, y = y)) +
+   geom_point(na.rm = TRUE)
>
> grid.arrange(p1, p2, ncol = 2)

There is a dramatic difference with the outliers removed.
> #Freqpoly with carat
> ggplot(data=smaller, mapping=aes(x=carat, color=cut)) +
+ geom_freqpoly(binwidth=0.1)+theme_bw()geom_freqpoly is useful for overlaying multiple histograms. It displays the counts with lines instead of bars.
> # Frequency polygon of price by cut, showing counts (left)
> p1 <- ggplot(data = diamonds, mapping = aes(x = price)) +
+   geom_freqpoly(mapping = aes(color = cut), binwidth = 500) +
+   theme_classic()
>
> # Frequency polygon of price by cut, showing density (right).
> # after_stat(density) replaces the deprecated `..density..` notation.
> p2 <- ggplot(
+   data = diamonds,
+   mapping = aes(x = price, y = after_stat(density))
+ ) +
+   geom_freqpoly(mapping = aes(color = cut), binwidth = 500) +
+   theme_classic()
>
> grid.arrange(p1, p2, ncol = 2)If the groups have very different sizes it is difficult to make inferences based on count. Instead we can use density, which is the count standardized so that the area under each frequency polygon is one.
> #Density based on price
> ggplot(data=diamonds)+
+ geom_density(mapping=aes(x=price, color=cut))+
+ theme_classic()geom_density is a good alternative to a frequency polygon, and the plot lines are smoother
> #heatmap by color and cut
> diamonds %>% count(color,cut) %>%
+ ggplot(mapping=aes(x=color, y=cut)) +
+ geom_tile(mapping=aes(fill=n))Here we can visualize the count by color and cut.
> # 2D rectangular bins with fill color to display count (left)
> p1 <- ggplot(data = smaller) +
+   geom_bin2d(mapping = aes(x = carat, y = price))
>
> # 2D hexagonal bins with fill color to display count (right)
> library("hexbin")
> p2 <- ggplot(data = smaller) +
+   geom_hex(mapping = aes(x = carat, y = price))
>
> grid.arrange(p1, p2, ncol = 2)geom_bin2d creates 2D rectangular bins and colors them by count and geom_hex uses hexagonal bins.
> # Visualize summary statistics: median depth per cut with its min-max range.
> # fun / fun.min / fun.max replace the deprecated fun.y / fun.ymin / fun.ymax.
> ggplot(data = diamonds) +
+   theme_light() +
+   stat_summary(
+     mapping = aes(x = cut, y = depth),
+     fun.min = min,
+     fun.max = max,
+     fun = median
+   )

It is also possible to visualize summary statistics, like min, max, and median.
> # Common plot options
> ggplot(data = diamonds) +
+ geom_bar(mapping = aes(x=cut, fill=cut))+
+ scale_fill_brewer(palette="Spectral") + # Change color set
+ theme_light() + #Change theme
+ theme(legend.position = "none") + #Remove legend
+ theme(text=element_text(size=15)) + #Increase text size
+ ggtitle("Quality of cut") + #Add Title
+ theme(plot.title = element_text(hjust = 0.5)) + #Center Title
+ xlab("Diamond Cut") + #Change x-axis label
+ ylab("Diamond Count") #Change y-axis label> # Carat, Price, and Cut (left)
> p1 <- ggplot(data = diamonds) +
+   geom_point(mapping = aes(x = carat, y = price, color = cut)) +
+   scale_colour_brewer(palette = "Spectral") +
+   facet_grid(. ~ cut)
>
> # Same plot with log-transformed x and y coordinates (right)
> p2 <- ggplot(data = diamonds) +
+   geom_point(mapping = aes(x = carat, y = price, color = cut)) +
+   scale_colour_brewer(palette = "Spectral") +
+   facet_grid(. ~ cut) +
+   coord_trans(x = "log", y = "log")
>
> grid.arrange(p1, p2, ncol = 2)> # Carat & Price with a linear line (left)
> p1 <- ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
+   geom_point() +
+   geom_smooth(method = "lm")
>
> # Same relationship with log-transformed carat and price (right)
> p2 <- ggplot(
+   data = diamonds,
+   mapping = aes(x = log(carat), y = log(price))
+ ) +
+   geom_point() +
+   geom_smooth(method = "lm")
>
> grid.arrange(p1, p2, ncol = 2)