Advanced ggplotting requires many extra libraries. They are loaded as needed.
Start with loading some basic libraries
library(dplyr)
library(ggplot2)
Load data from ggplot2 package
# Load the `midwest` data set that ships with ggplot2 into the workspace and
# print a compact overview of its structure (one line per column).
data(midwest, package = "ggplot2")
str(object = midwest)
## Classes 'tbl_df', 'tbl' and 'data.frame': 437 obs. of 28 variables:
## $ PID : int 561 562 563 564 565 566 567 568 569 570 ...
## $ county : chr "ADAMS" "ALEXANDER" "BOND" "BOONE" ...
## $ state : chr "IL" "IL" "IL" "IL" ...
## $ area : num 0.052 0.014 0.022 0.017 0.018 0.05 0.017 0.027 0.024 0.058 ...
## $ poptotal : int 66090 10626 14991 30806 5836 35688 5322 16805 13437 173025 ...
## $ popdensity : num 1271 759 681 1812 324 ...
## $ popwhite : int 63917 7054 14477 29344 5264 35157 5298 16519 13384 146506 ...
## $ popblack : int 1702 3496 429 127 547 50 1 111 16 16559 ...
## $ popamerindian : int 98 19 35 46 14 65 8 30 8 331 ...
## $ popasian : int 249 48 16 150 5 195 15 61 23 8033 ...
## $ popother : int 124 9 34 1139 6 221 0 84 6 1596 ...
## $ percwhite : num 96.7 66.4 96.6 95.3 90.2 ...
## $ percblack : num 2.575 32.9 2.862 0.412 9.373 ...
## $ percamerindan : num 0.148 0.179 0.233 0.149 0.24 ...
## $ percasian : num 0.3768 0.4517 0.1067 0.4869 0.0857 ...
## $ percother : num 0.1876 0.0847 0.2268 3.6973 0.1028 ...
## $ popadults : int 43298 6724 9669 19272 3979 23444 3583 11323 8825 95971 ...
## $ perchsd : num 75.1 59.7 69.3 75.5 68.9 ...
## $ percollege : num 19.6 11.2 17 17.3 14.5 ...
## $ percprof : num 4.36 2.87 4.49 4.2 3.37 ...
## $ poppovertyknown : int 63628 10529 14235 30337 4815 35107 5241 16455 13081 154934 ...
## $ percpovertyknown : num 96.3 99.1 95 98.5 82.5 ...
## $ percbelowpoverty : num 13.15 32.24 12.07 7.21 13.52 ...
## $ percchildbelowpovert: num 18 45.8 14 11.2 13 ...
## $ percadultpoverty : num 11.01 27.39 10.85 5.54 11.14 ...
## $ percelderlypoverty : num 12.44 25.23 12.7 6.22 19.2 ...
## $ inmetro : int 0 0 0 1 0 0 0 0 0 1 ...
## $ category : chr "AAR" "LHR" "AAR" "ALU" ...
A basic scatterplot with different colors for each state and size of points proportional to population density:
# Scatterplot of area (x) against total population (y). Point colour is mapped
# to state and point size to population density; a LOESS smoother with its
# standard-error band is added on top.
ggplot(midwest, aes(x = area, y = poptotal)) +
  geom_point(aes(col = state, size = popdensity)) +
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  geom_smooth(method = "loess", se = TRUE)
Add pre-defined ranges for x and y axes, add proper titles etc
# Same scatterplot as before, now with fixed axis ranges and descriptive
# title, subtitle, axis labels and caption.
ggplot(midwest, aes(x = area, y = poptotal)) +
  geom_point(aes(col = state, size = popdensity)) +
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  geom_smooth(method = "loess", se = TRUE) +
  # xlim()/ylim() set scale limits, so observations outside these ranges are
  # dropped before the smoother is fitted (see the ggplot2 `lims` docs).
  xlim(c(0, 0.1)) +
  ylim(c(0, 500000)) +
  labs(
    subtitle = "Area Vs Population",
    y = "Population",
    x = "Area",
    title = "Scatterplot",
    caption = "Source: midwest"
  )
Encircle a special region of the scatterplot using geom_encircle() from the ggalt package.
library(ggplot2)
library(ggalt) # for geom_encircle()
# library() rather than require(): a missing package stops the script here
# instead of returning FALSE and failing later at filter().
library(dplyr) # for filtering data

# The points that are going to be encircled: counties whose total population
# and area fall inside the window below.
midwest_select <- midwest %>%
  filter(
    poptotal > 350000 &
      poptotal <= 500000 &
      area > 0.01 &
      area < 0.1
  )

# Scatterplot with a green outline drawn around the selected counties.
ggplot(midwest, aes(x = area, y = poptotal)) +
  geom_point(aes(col = state, size = popdensity)) + # draw points
  geom_smooth(method = "loess", se = TRUE) + # draw smoothing line
  xlim(c(0, 0.1)) +
  ylim(c(0, 500000)) +
  geom_encircle(
    aes(x = area, y = poptotal),
    data = midwest_select,
    color = "green",
    size = 2, # line thickness
    expand = 0.06
  ) + # encircle
  labs(
    subtitle = "Area Vs Population",
    y = "Population",
    x = "Area",
    title = "Scatterplot + Encircle",
    caption = "Source: midwest"
  )
A scatterplot where the size of each point shows how many observations overlap at that position. geom_count() is a variant of geom_point():
# Counts plot of city vs. highway mileage: geom_count() draws one point per
# distinct (cty, hwy) pair, sized by the number of observations at that spot.
data(mpg, package = "ggplot2")
ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  geom_count(col = "tomato3", show.legend = FALSE) +
  labs(
    subtitle = "mpg: city vs highway mileage",
    y = "hwy",
    x = "cty",
    title = "Counts Plot"
  )
Let's plot the same data using geom_point(). Notice that the overlapping of points is not captured at all (geom_jitter() can be used to reveal it).
# Plain geom_point() version of the city vs. highway mileage plot, for
# comparison with the counts plot: coinciding observations are drawn on top
# of each other.
data(mpg, package = "ggplot2")
ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
  geom_point(col = "tomato2", show.legend = FALSE) +
  labs(
    subtitle = "mpg: city vs highway mileage",
    y = "hwy",
    x = "cty",
    title = "geom_point Plot"
  )
A marginal histogram or density plot (ggMarginal() from the ggExtra package) shows the relationship and the distributions in the same plot.
library(ggExtra)
data(mpg, package = "ggplot2")

# NOTE(review): `mpg_select` is not used by the plots in this section; it is
# kept in case code elsewhere relies on it.
mpg_select <- mpg[mpg$hwy >= 35 & mpg$cty > 27, ]

# Base plot: counts of (cty, hwy) pairs plus a linear fit without its
# standard-error band. TRUE/FALSE are spelled out because `T`/`F` are
# ordinary variables that can be reassigned.
g <- ggplot(mpg, aes(cty, hwy)) +
  geom_count(show.legend = FALSE) +
  geom_smooth(method = "lm", se = FALSE)

# Add marginal plots to the base plot, first as histograms, then as
# density curves.
ggMarginal(g, type = "histogram", fill = "transparent")
ggMarginal(g, type = "density", fill = "transparent")
Examine the correlation of multiple continuous variables present in the same data frame. Let's plot a basic one.
library(ggplot2)
library(ggcorrplot)
# Correlation matrix
data(mtcars)

# Basic plot: lower triangle of the mtcars correlation matrix drawn as
# circles, with the correlation values printed as labels.
# `lab = TRUE` instead of `lab = T`: `T` can be reassigned.
ggcorrplot(
  cor(mtcars),
  type = "lower",
  lab = TRUE,
  method = "circle",
  lab_size = 3
)
A decorated plot
library(ggplot2)
library(ggcorrplot)
# Reload mtcars and compute its correlation matrix, rounded to one decimal.
data(mtcars)
corr <- round(cor(mtcars), digits = 1)

# Decorated correlogram: lower triangle only, variables reordered via
# hc.order, labelled circles, a custom three-colour gradient, a title and
# the black-and-white theme.
ggcorrplot(
  corr,
  method = "circle",
  type = "lower",
  hc.order = TRUE,
  lab = TRUE,
  lab_size = 3,
  colors = c("tomato2", "white", "springgreen3"),
  ggtheme = theme_bw,
  title = "Correlogram of mtcars"
)
Diverging Bars is a bar chart that can handle both negative and positive values. A z-score (also known as a standard score) indicates how many standard deviations an element is from the mean. A z-score can be calculated from the following formula:
$$z = \frac{X - \mu}{\sigma}$$
where $z$ is the z-score, $X$ is the value of the element, $\mu$ is the population mean, and $\sigma$ is the standard deviation.
If the number of elements in the set is large and the values are approximately normally distributed, about 68% of the elements have a z-score between -1 and 1; about 95% have a z-score between -2 and 2; and about 99.7% have a z-score between -3 and 3.
# Start from a fresh copy of the built-in data set.
data("mtcars")

# Keep the car names (the row names) as a regular column.
mtcars[["car name"]] <- rownames(mtcars)

# Standardise mpg: distance from the mean in units of the standard
# deviation, rounded to two decimals.
mtcars[["mpg_z"]] <- with(mtcars, round((mpg - mean(mpg)) / sd(mpg), 2))

# Flag each car as having "above" or "below" average mileage.
mtcars[["mpg_type"]] <- ifelse(mtcars[["mpg_z"]] >= 0, "above", "below")

# Sort the rows by ascending z-score.
mtcars <- mtcars[order(mtcars[["mpg_z"]]), ]

# Turn the names into a factor whose levels follow the sorted row order, so
# that plots keep this ordering instead of sorting alphabetically.
mtcars[["car name"]] <- factor(
  mtcars[["car name"]],
  levels = mtcars[["car name"]]
)
# Diverging bar chart: one bar per car showing its standardised mileage,
# filled according to whether it is above or below average.
ggplot(
  data = mtcars,
  mapping = aes(x = `car name`, y = mpg_z, label = mpg_z)
) +
  # geom_col() is geom_bar() with stat = "identity"; bars are drawn with
  # width 0.5.
  geom_col(mapping = aes(fill = mpg_type), width = 0.5) +
  # Print the mpg_z value of each bar in black.
  geom_text(size = 3, color = "black") +
  scale_fill_manual(
    name = "Mileage",
    values = c("above" = "green", "below" = "tomato"),
    labels = c("Above Average", "Below Average")
  ) +
  labs(
    title = "Diverging Bars",
    subtitle = "Normalised mileage from 'mtcars'"
  ) +
  # Flip the coordinates so the car names are shown clearly/horizontally.
  coord_flip()
Ordered Bar Chart is a Bar Chart that is ordered by the Y axis variable
# Mean city mileage per manufacturer, with the two result columns named
# "make" and "mileage".
cty_mpg <- setNames(
  aggregate(mpg$cty, by = list(mpg$manufacturer), FUN = mean),
  c("make", "mileage")
)

# Sort by ascending mileage.
cty_mpg <- cty_mpg[order(cty_mpg$mileage), ]

# To retain that order in the plot, the character column must become a
# factor whose levels follow the sorted rows.
cty_mpg$make <- factor(cty_mpg$make, levels = cty_mpg$make)

# Draw the plot.
# geom_col() is used here; the equivalent spelling would be
# geom_bar(stat = "identity", width = .5, fill = "tomato3").
ggplot(data = cty_mpg, mapping = aes(x = make, y = mileage)) +
  geom_col(fill = "tomato3", width = 0.5) +
  labs(
    title = "Ordered Bar Chart",
    subtitle = "Make Vs Avg. Mileage",
    x = "Make",
    y = "Mileage",
    caption = "source: mpg"
  ) +
  # Rotate the manufacturer labels on the x axis.
  theme(axis.text.x = element_text(angle = 65, vjust = 0.7))
Alternatively, this can be achieved using dplyr and forcats::fct_reorder(), as shown below. The value of each bar is also shown as text.
# library() rather than require(): a missing package stops the script here
# instead of returning FALSE and failing later at fct_reorder().
library(forcats) # for factor re-ordering

# Ordered bar chart of mean city mileage per manufacturer, built in a single
# pipeline, with the value printed above each bar.
mpg %>%
  group_by(manufacturer) %>%
  summarise(Mileage = mean(cty)) %>%
  # Convert to a factor to maintain the sorted ordering, either in two steps:
  #   mutate(Make2 = factor(manufacturer, levels = manufacturer)) %>%
  #   mutate(Make = fct_reorder(Make2, Mileage)) %>%
  # or, as done here, in one go with forcats::fct_reorder():
  mutate(Make = fct_reorder(manufacturer, Mileage)) %>%
  ggplot(mapping = aes(x = Make, y = Mileage)) +
  geom_col(width = 0.5, fill = "tomato2") +
  labs(
    title = "Ordered bar chart",
    subtitle = "Make vs Avg. Mileage",
    x = "Make",
    y = "Mileage"
  ) +
  # Print the value for each bar as well. The mean is formatted directly to
  # one decimal; rounding to two decimals first was redundant and could
  # round twice.
  geom_text(
    aes(label = sprintf("%0.1f", Mileage)),
    color = "black",
    size = 4,
    vjust = -0.5
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.7))
A lollipop chart shows the same information as a bar chart or a diverging bar chart.
# Diverging lollipop chart of the standardised mileage: a large point per
# car, a segment joining it to zero, and the mpg_z value printed in white.
# Uses the `car name` and `mpg_z` columns of mtcars.
ggplot(
  data = mtcars,
  mapping = aes(x = `car name`, y = mpg_z, label = mpg_z)
) +
  geom_point(stat = "identity", fill = "black", size = 6) +
  # Stem of each lollipop: from y = 0 to the car's z-score.
  geom_segment(
    mapping = aes(
      x = `car name`,
      xend = `car name`,
      y = 0,
      yend = mpg_z
    ),
    color = "black"
  ) +
  geom_text(size = 2, color = "white") +
  labs(
    title = "Diverging Lollipop Chart",
    subtitle = "Normalized mileage from 'mtcars': Lollipop"
  ) +
  ylim(-2.5, 2.5) +
  # Flip the coordinates so the car names run down the vertical axis.
  coord_flip()