Explore how to create various types of plots in R using the diamonds data set, which is available in the ggplot2 package
library(ggplot2)
library(tidyverse)
Type ? before the name of the data set to open its R Documentation page:
?diamonds
| diamonds | R Documentation |
A dataset containing the prices and other attributes of almost 54,000 diamonds. The variables are as follows:
A data frame with 53940 rows and 10 variables:
price: price in US dollars ($326–$18,823)
carat: weight of the diamond (0.2–5.01)
cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
color: diamond colour, from D (best) to J (worst)
clarity: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))
x: length in mm (0–10.74)
y: width in mm (0–58.9)
z: depth in mm (0–31.8)
depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43–79)
table: width of top of diamond relative to widest point (43–95)
# Load the diamonds data set into the workspace and preview it
data("diamonds")
head(diamonds, n = 12L) # prints the first dozen rows of data
## # A tibble: 12 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## 11 0.3 Good J SI1 64 55 339 4.25 4.28 2.73
## 12 0.23 Ideal J VS1 62.8 56 340 3.93 3.9 2.46
# Compact overview of every column: its type and first few values
str(object = diamonds) # summarizes data structure
## tibble [53,940 x 10] (S3: tbl_df/tbl/data.frame)
## $ carat : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Using qplot()
# qplot() is a shortcut designed to be familiar if you're used to base plot().
# Bar plot of `cut`
qplot(cut, data = diamonds) # cut is an ordered factor with 5 levels
# Keep the plot in an object so that layers can be added one step at a time
barcut <- qplot(cut, data = diamonds)
barcut
# add frequencies of each type of `cut`
barcut <- barcut +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1)
barcut
# expand y axis to leave room for the count labels above the bars
barcut <- barcut + ylim(0, 25000)
barcut
# change font size
barcut <- barcut + theme(text = element_text(size = 14))
barcut
# change color
barcut <- barcut + geom_bar(fill = "lightsteelblue4")
barcut
# For more fill options, see here
# add title
barcut <- barcut + ggtitle("Quality of cut")
barcut
# center title
barcut <- barcut + theme(plot.title = element_text(hjust = 0.5))
barcut
# change theme: theme_minimal() is a complete theme, so adding it discards the
# earlier theme() tweaks; re-apply the font size and centered title after it
barcut <- barcut +
  theme_minimal() +
  theme(
    text = element_text(size = 14),
    plot.title = element_text(hjust = 0.5)
  )
barcut
# Same bar plot of `cut`, built in a single chain
barcut <- qplot(cut, data = diamonds) +
  # add frequencies of each type of `cut`
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1) +
  # change color
  geom_bar(fill = "lightsteelblue4") +
  # expand y axis
  ylim(0, 25000) +
  # add title
  ggtitle("Quality of cut") +
  # change theme first: theme_minimal() is a complete theme, so adding it after
  # the theme() tweaks below would discard them
  theme_minimal() +
  # change font size
  theme(text = element_text(size = 14)) +
  # center title
  theme(plot.title = element_text(hjust = 0.5))
barcut
# Horizontal version of the bar plot of `cut`
qplot(cut, data = diamonds) +
  # flip the axes
  coord_flip() +
  # add frequencies of each type of `cut`
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    hjust = -0.25,
    vjust = 1
  ) +
  # expand y axis
  ylim(0, 25000) +
  # change color
  geom_bar(fill = "lightsteelblue4") +
  # add title
  ggtitle("Quality of cut") +
  # change theme first: theme_minimal() is a complete theme, so adding it after
  # the theme() tweaks below would discard them
  theme_minimal() +
  # change font size
  theme(text = element_text(size = 14)) +
  # center title
  theme(plot.title = element_text(hjust = 0.5))
# `price`: histograms and density plots
# base-graphics histogram
hist(diamonds$price)
# qplot() draws a histogram by default for a single numeric variable
qplot(price, data = diamonds)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# density plot, first as an outline and then filled
qplot(price, geom = "density", data = diamonds)
qplot(price, geom = "density", data = diamonds) +
  geom_density(fill = "lightsteelblue4")
# Histogram of `price` with density plot overlaid using ggplot()
ggplot(diamonds, aes(x = price)) +
  # y = density puts the histogram on the same scale as the density curve
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 40,
    color = "black",
    fill = "white"
  ) +
  # value of alpha controls the level of transparency
  geom_density(alpha = 0.4, fill = "lightsteelblue4")
# `cut` and `color`
# within each type of cut, display proportion of color
q1 <- qplot(x = cut, fill = color, data = diamonds)
q1
# expand the count axis
q1 <- q1 + ylim(0, 25000)
q1
# flip the axes
q1 <- q1 + coord_flip()
q1
# change the fill palette with scale_fill_brewer
q1 <- q1 + scale_fill_brewer(palette = "Set3")
q1
Additional info here
# Count every cut/color combination, then express each count as a percentage
# of its cut and build a printable label from that percentage
row_pct <- count(diamonds, cut, color) %>%
  group_by(cut) %>%
  mutate(
    percent = (n / sum(n)) * 100,
    # using "%0.1f%%" rounds to one decimal place
    label = sprintf("%0.0f%%", percent)
  )
row_pct
## # A tibble: 35 x 5
## # Groups: cut [5]
## cut color n percent label
## <ord> <ord> <int> <dbl> <chr>
## 1 Fair D 163 10.1 10%
## 2 Fair E 224 13.9 14%
## 3 Fair F 312 19.4 19%
## 4 Fair G 314 19.5 20%
## 5 Fair H 303 18.8 19%
## 6 Fair I 175 10.9 11%
## 7 Fair J 119 7.39 7%
## 8 Good D 662 13.5 13%
## 9 Good E 933 19.0 19%
## 10 Good F 909 18.5 19%
## # ... with 25 more rows
# Bars rescaled to proportions (position = "fill"), one bar per cut, with the
# percentage labels from row_pct centered inside each color segment
ggplot(diamonds, aes(x = cut, fill = color)) +
  geom_bar(position = "fill") +
  geom_text(
    data = row_pct,
    mapping = aes(y = n, label = label),
    position = position_fill(vjust = 0.5),
    size = 3
  ) +
  coord_flip() +
  scale_fill_brewer(palette = "Set3") +
  ggtitle("Proportions of Color Within Cut")
# Contingency table of counts: cut in rows, color in columns
tab <- table(diamonds[["cut"]], diamonds[["color"]])
addmargins(A = tab) # adds row sums and column sums
##
## D E F G H I J Sum
## Fair 163 224 312 314 303 175 119 1610
## Good 662 933 909 871 702 522 307 4906
## Very Good 1513 2400 2164 2299 1824 1204 678 12082
## Premium 1603 2337 2331 2924 2360 1428 808 13791
## Ideal 2834 3903 3826 4884 3115 2093 896 21551
## Sum 6775 9797 9542 11292 8304 5422 2808 53940
# Row percentages: share of each color within a cut (each row sums to 100)
row_props <- prop.table(tab, margin = 1)
round(row_props, digits = 3) * 100
##
## D E F G H I J
## Fair 10.1 13.9 19.4 19.5 18.8 10.9 7.4
## Good 13.5 19.0 18.5 17.8 14.3 10.6 6.3
## Very Good 12.5 19.9 17.9 19.0 15.1 10.0 5.6
## Premium 11.6 16.9 16.9 21.2 17.1 10.4 5.9
## Ideal 13.2 18.1 17.8 22.7 14.5 9.7 4.2
# Column percentages: share of each cut within a color (each column sums to 100)
col_props <- prop.table(tab, margin = 2)
round(col_props, digits = 3) * 100
##
## D E F G H I J
## Fair 2.4 2.3 3.3 2.8 3.6 3.2 4.2
## Good 9.8 9.5 9.5 7.7 8.5 9.6 10.9
## Very Good 22.3 24.5 22.7 20.4 22.0 22.2 24.1
## Premium 23.7 23.9 24.4 25.9 28.4 26.3 28.8
## Ideal 41.8 39.8 40.1 43.3 37.5 38.6 31.9
# Pearson chi-squared test on the cut-by-color contingency table
chisq.test(x = tab)
##
## Pearson's Chi-squared test
##
## data: tab
## X-squared = 310.32, df = 24, p-value < 2.2e-16
# `cut` and `price`
# histogram of price with bars filled by cut
qplot(x = price, fill = cut, data = diamonds)
# one density curve of price per cut
qplot(x = price, color = cut, geom = "density", data = diamonds)
# side-by-side boxplots of price for each cut, drawn horizontally
qplot(x = cut, y = price, geom = "boxplot", data = diamonds) +
  coord_flip() +
  ggtitle("Boxplots of Price by Cut")
# Does `price` depend on `cut`?
# The distributions of `price` given `cut` don't look normal (there is a long
# tail), so the comparison uses the rank-based Kruskal-Wallis test
hist(diamonds$price)
kruskal.test(price ~ cut, data = diamonds)
##
## Kruskal-Wallis rank sum test
##
## data: price by cut
## Kruskal-Wallis chi-squared = 978.62, df = 4, p-value < 2.2e-16
# `carat` and `price`
# basic scatterplot
qplot(x = carat, y = price, data = diamonds)
# add a smoothed trend line using the default smoother
qplot(x = carat, y = price, data = diamonds) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# add a straight-line (linear model) fit instead
qplot(x = carat, y = price, data = diamonds) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# same linear fit after log-transforming both variables
qplot(x = log(carat), y = log(price), data = diamonds) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# color the points by cut
qplot(x = carat, y = price, color = cut, data = diamonds) +
  ggtitle("Scatterplot of Carat and Price, by Type of Cut")
# one smoothed line per cut, drawn over faint points
qplot(x = carat, y = price, color = cut, geom = "smooth", data = diamonds) +
  geom_point(alpha = 0.02) + # alpha controls the transparency
  ggtitle("Scatterplot of Carat and Price, with Smoothed Lines by Type of Cut")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# one panel per cut, with restyled points
qplot(x = carat, y = price, facets = . ~ cut, data = diamonds) +
  geom_point(shape = 21, fill = "lightgray", color = "black", size = 1.5) +
  theme_bw() +
  ggtitle("Scatterplots of Carat and Price, by Cut")
# one panel per cut, each with its own smoothed trend line
qplot(x = carat, y = price, data = diamonds) +
  geom_smooth() +
  facet_grid(~cut)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Facet by `color`: one row of panels per color, points colored by cut
qplot(x = carat, y = price, color = cut, facets = color ~ ., data = diamonds)
# `clarity`: `carat` and `price` by cut (top panel), by clarity (right hand side panel).
# In the call below, points are colored by clarity; panel rows are color and
# panel columns are cut
qplot(
  x = carat, y = price, color = clarity, facets = color ~ cut,
  data = diamonds
)
# color the points by the numeric variable depth
qplot(x = carat, y = price, color = depth, data = diamonds)
# Cut `depth` into 4 categories (defined by its quartiles)
# include.lowest = TRUE keeps the minimum depth inside the first interval
diamonds$depthcat <- cut(
  diamonds$depth,
  breaks = quantile(diamonds$depth),
  include.lowest = TRUE
)
# 4 panels where each panel is a quartile
qplot(x = carat, y = price, facets = . ~ depthcat, data = diamonds)
# Several plots in one window: with the graphics package, we can do that with par(mfrow=c( , ))
# Boxplot of `price` and a histogram of `price`, stacked in 2 rows and 1 column
old_par <- par(mfrow = c(2, 1)) # par() returns the settings it replaced
boxplot(diamonds$price, horizontal = TRUE, main = "Boxplot of Price")
hist(diamonds$price, main = "Histogram of Price", xlab = "")
par(old_par) # restore the previous panel layout for later base plots
# layout(): compared with the par(mfrow=...) solution, layout() allows greater control of panel parts
# Layout function to split the plotting window in base R
# (two stacked panels; the lower one is 8 times as tall as the upper one)
layout(
  mat = matrix(c(1, 2), nrow = 2, ncol = 1, byrow = TRUE),
  heights = c(1, 8)
)
# Draw the boxplot and the histogram
# no bottom margin on the boxplot so it sits directly above the histogram
old_par <- par(mar = c(0, 3.1, 1.1, 2.1)) # keep the previous margins
boxplot(
  diamonds$price,
  horizontal = TRUE, xaxt = "n", col = "lightsteelblue3", frame = FALSE
)
par(mar = c(4, 3.1, 1.1, 2.1))
hist(
  diamonds$price,
  breaks = 40, col = "lightsteelblue3", border = TRUE,
  main = "", xlab = "Price", xlim = range(diamonds$price)
)
# Add a vertical line indicating the median
abline(v = median(diamonds$price), col = "black", lwd = 3, lty = 2)
# restore the margins and the single-panel layout for later base plots
par(old_par)
layout(1)
# grid.arrange
# par(mfrow=c(,)) doesn't work with ggplot; use grid.arrange in library(gridExtra)
library(gridExtra) # call in the necessary library
# top panel: boxplot of price
p1 <- ggplot(diamonds, aes(x = price)) +
  geom_boxplot() +
  ggtitle("Boxplot of Price")
# bottom panel: histogram on the density scale with a density curve on top
p2 <- ggplot(diamonds, aes(x = price)) +
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 150,
    color = "black",
    fill = "white"
  ) +
  geom_density(alpha = 0.2) +
  ggtitle("Histogram of price with density plot overlaid")
# stack the two ggplot objects in 2 rows
grid.arrange(p1, p2, nrow = 2)