Welcome to Ed’s notes on data visualisation - a kitchen sink of principles visualisations for knowing what is possible and plausible.
There are a couple of books that were referenced in taking these notes
Most content is sourced from variation on The R Graph Gallery
The excellent Data to Viz helps in selecting the right type based on types and combinations of variables.
For consistent styling reference the BBC Visual and Data Journalism Cookbook
These are the components of a visualision:
facet_wrap() for single categorical variable and facet_grid() for combination of two categorical variables in a grid.e.g. in a scatterplot the position of a glyph in both horizontal and vertical senses are the visual clues that help viewer understand how big they are. The aesthetic is the mapping that defines these correspondences.
A data table provides the basis for drawing a data graphic. Each case in the data table becomes a mark in the graph.
table of canonical data graphics
These are useful for understanding how a single variable is distribution. Never use pie charts as human readers cannot interpret angles
g <- ggplot(data = SAT_2010, aes(x = math))
g + geom_histogram(binwidth = 10)
Univariate numerical charts
# adjust changes the bandwidth used by the kernal smoother
# position and direction in a cartesian plane with a horizontal scale defined by data units
g + geom_density(adjust = 0.3)
If the variable is categorical we use bar graphs to display distributions of that variable.
ggplot(data = head(SAT_2010, 10), aes(x = reorder(state, math), y = math)) +
geom_bar(stat = "identity")
Here is another example of a categorical variable this time using color and flipping the coordinates
ggplot(data = HELPrct, aes(x = homeless)) +
geom_bar(aes(fill = substance), position = "fill") +
coord_flip()
A stacked bar plot showing the distribution of substance of abuse for participants in the HELP study
A table is a form of visualisation, we use kable to make pretty tables
av_life_exp <- gapminder %>%
filter(year == 1997) %>%
group_by(continent) %>%
summarise(avg_life_exp = mean(lifeExp))
## `summarise()` ungrouping output (override with `.groups` argument)
kable(
av_life_exp,
col.names = c('Continent',
'Average Life Expectancy'),
align = 'cc',
caption = 'Average Life Expectancy for each continent in 1997'
)
| Continent | Average Life Expectancy |
|---|---|
| Africa | 53.59827 |
| Americas | 71.15048 |
| Asia | 68.02052 |
| Europe | 75.50517 |
| Oceania | 78.19000 |
Most basic form used for looking at distribution of a numerical variable
data = data.frame(value = rnorm(100))
data %>%
ggplot(aes(x = value)) +
geom_histogram()
A Basic Histogram
Shows two distributions for variables
data <-
data.frame(type = c(rep("variable 1", 100), rep("variable 2", 100)),
value = c(rnorm(100), rnorm(100, mean = 4)))
data %>%
ggplot(aes(x = value, fill = type)) +
geom_histogram(color = "#e9ecef",
alpha = 0.6,
position = 'identity') +
scale_fill_manual(values = c("#69b3a2", "#404080")) +
labs(fill = "", title = "Histogram of two variables")
Mirror Histograms you see distribution of two variables.
data <- data.frame(var1 = rnorm(50),
var2 = rnorm(50, mean = 2))
data %>% ggplot(aes(x = x)) +
geom_density(aes(x = var1, y = ..density..), fill = "#69b3a2") +
geom_label(aes(x = 4.5, y = 0.25, label = "variable1"), color = "#69b3a2") +
geom_density(aes(x = var2, y = -..density..), fill = "#404080") +
geom_label(aes(x = 4.5, y = -0.25, label = "variable2"), color = "#404080") +
xlab("value of x")
These allow you to show distribution of values on the top and side of a point of line graph
p <- ggplot(mtcars, aes(
x = wt,
y = mpg,
color = cyl,
size = cyl
)) +
geom_point() +
theme(legend.position = "none")
ggMarginal(p, type = "histogram")
ggMarginal(p,
type = "histogram",
fill = "slateblue",
xparams = list(bins = 10))
ggMarginal(p, type = "boxplot")
ggMarginal(p,
margins = 'x',
color = "purple",
size = 4)
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) +
facet_grid(drv ~ cyl) +
labs(
Title = "Multiple facets for displ vs hwy",
caption = "Source: mtcars"
)
Correlation is often computed as part of descriptive statistics in order to study the relationship between two variables.
The most simple way to do this is with a correlation matrix
dat <- mtcars[, c(1, 3:7)]
round(cor(dat), 2)
## mpg disp hp drat wt qsec
## mpg 1.00 -0.85 -0.78 0.68 -0.87 0.42
## disp -0.85 1.00 0.79 -0.71 0.89 -0.43
## hp -0.78 0.79 1.00 -0.45 0.66 -0.71
## drat 0.68 -0.71 -0.45 1.00 -0.71 0.09
## wt -0.87 0.89 0.66 -0.71 1.00 -0.17
## qsec 0.42 -0.43 -0.71 0.09 -0.17 1.00
A better visual way of looking at this is with a correlation plot.
dat <- mtcars[, c(1, 3:7)]
corrplot2 <- function(data,
method = "pearson",
sig.level = 0.05,
order = "original",
diag = FALSE,
type = "upper",
tl.srt = 90,
number.font = 1,
number.cex = 1,
mar = c(0, 0, 0, 0)) {
data_incomplete <- data
data <- data[complete.cases(data),]
mat <- cor(data, method = method)
cor.mtest <- function(mat, method) {
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat <- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(mat[, i], mat[, j], method = method)
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
p.mat
}
p.mat <- cor.mtest(data, method = method)
col <-
colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(
mat,
method = "color",
col = col(200),
number.font = number.font,
mar = mar,
number.cex = number.cex,
type = type,
order = order,
addCoef.col = "black",
tl.col = "black",
tl.srt = tl.srt,
p.mat = p.mat,
sig.level = sig.level,
insig = "blank",
diag = diag
)
}
corrplot2(
data = dat,
method = "pearson",
sig.level = 0.05,
order = "original",
diag = FALSE,
type = "upper",
tl.srt = 75
)
Shows the evolution of one or more numeric variables. Requires
ggplot(mtcars, aes(wt, mpg)) +
geom_line()
# ggplot(mtcars, aes(wt, mpg, color=as.factor(cyl_discr))) +
# geom_point(alpha = 0.4, size = 2) +
# geom_line(linetype="dashed") +
# geom_smooth(method = "lm", se = FALSE) +
# geom_smooth(aes(group = 1), method = "lm", se = FALSE, linetype="dashed", color="gray") +
# labs(
# title = "Multiple lines by factor with a global line"
# ) +
# theme_minimal()
dat <-
read.table(
"https://raw.githubusercontent.com/holtzy/data_to_viz/master/Example_dataset/3_TwoNumOrdered.csv",
header = T
)
dat$date <- as.Date(dat$date)
dat %>%
tail(10) %>%
ggplot(aes(x = date, y = value)) +
geom_line() +
geom_point()
data <-
read.table(
"https://raw.githubusercontent.com/holtzy/data_to_viz/master/Example_dataset/3_TwoNumOrdered.csv",
header = T
)
data$date <- as.Date(data$date)
data %>%
tail(10) %>%
ggplot(aes(x = date, y = value)) +
geom_line(color = "grey") +
geom_point(
shape = 21,
color = "black",
fill = "#69b3a2",
size = 6
) +
theme_minimal() +
ggtitle("Evolution of bitcoin price")
data <- babynames %>%
filter(name %in% c("Ashley", "Amanda")) %>%
filter(sex=="F") %>%
filter(year>1970) %>%
select(year, name, n) %>%
spread(key = name, value=n, -1)
# Select a few date to label the chart
tmp_date <- data %>% sample_frac(0.3)
# plot
data %>%
ggplot(aes(x=Amanda, y=Ashley, label=year)) +
geom_point(color="#69b3a2") +
geom_text_repel(data=tmp_date) +
geom_segment(color="#69b3a2",
aes(
xend=c(tail(Amanda, n=-1), NA),
yend=c(tail(Ashley, n=-1), NA)
),
arrow=arrow(length=unit(0.3,"cm"))
) +
theme_clean()
## Warning: Removed 1 rows containing missing values (geom_segment).
You can introduce a categorical variable as well for multiple lines
don <- babynames %>%
filter(name %in% c("Ashley", "Patricia", "Helen")) %>%
filter(sex == "F")
don %>%
ggplot(aes(
x = year,
y = n,
group = name,
color = name
)) +
geom_line() +
scale_color_viridis(discrete = TRUE) +
ggtitle("Popularity of American names in the previous 30 years") +
theme_minimal() +
ylab("Number of babies born")
The rug part is used on the axes to show the frequency of data points.
ggplot(data=iris, aes(x=Sepal.Length, Petal.Length)) +
geom_point() +
geom_rug(col="red",alpha=0.1, size=1.5)
We can create correlations for each grouped variable and for the dataset as a whole and modify the legend to show this.
Note that the default span for LOESS is 0.9. Lower spans give a better fit.
ggplot(mtcars, aes(x = wt, y = mpg, col = factor(cyl))) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, span = 0.7) +
geom_smooth(method = "loess",
aes(group = 1, col="All"),
se = FALSE, span = 0.7) +
# Add correct arguments to scale_color_manual
scale_color_manual("Cylinders", values = c("red", "blue", "green", 'black')) +
labs(
title = 'MPG vs weight for various cyclinder number',
x = 'Weight',
y = 'Miles per gallon'
)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
#Prepare data
multiple_line_df <- gapminder %>%
filter(country == "China" | country == "United States")
#Make plot
multiple_line <- ggplot(multiple_line_df, aes(x = year, y = lifeExp, colour = country)) +
geom_line(size = 1) +
geom_hline(yintercept = 0, size = 1, colour="#333333") +
scale_colour_manual(values = c("#FAAB18", "#1380A1")) +
bbc_style() +
labs(title="Living longer",
subtitle = "Life expectancy in China and the US")
multiple_line + geom_hline(yintercept = 10, size = 1, colour = "red", linetype = "dashed")
#Prepare data
bar_df <- gapminder %>%
filter(year == 2007 & continent == "Africa") %>%
arrange(desc(lifeExp)) %>%
head(5)
#Make plot
ggplot(bar_df, aes(x = country, y = lifeExp)) +
geom_bar(stat="identity",
position="identity",
fill="#1380A1") +
geom_hline(yintercept = 0, size = 1, colour="#333333") +
bbc_style() +
labs(title="Reunion is highest",
subtitle = "Highest African life expectancy, 2007")
## Horizontal bar chart
We use coord_flip() to achieve this. We often want to hide the title of the y axis as well by calling theme(axis.title.y = element_blank()) at the end.
ggplot(diamonds) +
geom_bar(aes(cut, stat(prop), group = 1)) +
coord_flip() +
labs(
title = "Proportion of diamonds by cut",
y = "Proportion",
caption = "Source: Tidyverse Data"
) +
theme_tufte() +
theme(axis.title.y = element_blank())
stacked_df <- gapminder %>%
filter(year == 2007) %>%
mutate(lifeExpGrouped = cut(lifeExp,
breaks = c(0, 50, 65, 80, 90),
labels = c("Under 50", "50-65", "65-80", "80+"))) %>%
group_by(continent, lifeExpGrouped) %>%
summarise(continentPop = sum(as.numeric(pop)))
## `summarise()` regrouping output by 'continent' (override with `.groups` argument)
stacked_df$lifeExpGrouped = factor(stacked_df$lifeExpGrouped, levels = rev(levels(stacked_df$lifeExpGrouped)))
ggplot(data = stacked_df,
aes(x = continent,
y = continentPop,
fill = lifeExpGrouped)) +
geom_bar(stat = "identity",
position = "fill") +
bbc_style() +
scale_y_continuous(labels = scales::percent) +
scale_fill_viridis_d(direction = -1) +
geom_hline(yintercept = 0, size = 1, colour = "#333333") +
labs(title = "How life expectancy varies",
subtitle = "% of population by life expectancy band, 2007") +
theme(legend.position = "top",
legend.justification = "left") +
guides(fill = guide_legend(reverse = TRUE))
grouped_bar_df <- gapminder %>%
filter(year == 1967 | year == 2007) %>%
select(country, year, lifeExp) %>%
spread(year, lifeExp) %>%
mutate(gap = `2007` - `1967`) %>%
arrange(desc(gap)) %>%
head(5) %>%
gather(key = year,
value = lifeExp,
-country,
-gap)
ggplot(grouped_bar_df,
aes(x = country,
y = lifeExp,
fill = as.factor(year))) +
geom_bar(stat="identity", position="dodge") +
geom_hline(yintercept = 0, size = 1, colour="#333333") +
bbc_style() +
scale_fill_manual(values = c("#1380A1", "#FAAB18")) +
labs(title="We're living longer",
subtitle = "Biggest life expectancy rise, 1967-2007")
## Lollipop Chart
# Prepare data: group mean city mileage by manufacturer.
cty_mpg <- aggregate(mpg$cty, by=list(mpg$manufacturer), FUN=mean) # aggregate
colnames(cty_mpg) <- c("make", "mileage") # change column names
cty_mpg <- cty_mpg[order(cty_mpg$mileage), ] # sort
cty_mpg$make <- factor(cty_mpg$make, levels = cty_mpg$make) # to retain the order in plot.
ggplot(cty_mpg, aes(x=make, y=mileage)) +
geom_point(size=3) +
geom_segment(aes(x=make,
xend=make,
y=0,
yend=mileage)) +
labs(title="Lollipop Chart",
subtitle="Make Vs Avg. Mileage",
caption="source: mpg") +
theme(axis.text.x = element_text(angle=65, vjust=0.6)) +
theme_bw()
ggplot(data = diamonds) +
geom_bar(aes(cut, stat(prop), group = 1)) +
theme_minimal() +
labs(
title = "Breakdown of diamonds by cut",
x = "Cut Type",
y = "Proportion",
caption = "Source: tidyverse"
)
ggplot(diamonds) +
stat_summary(
mapping = aes(x = cut, y = depth),
fun.ymin = min,
fun.ymax = max,
fun.y = median
)
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: `fun.ymin` is deprecated. Use `fun.min` instead.
## Warning: `fun.ymax` is deprecated. Use `fun.max` instead.
#Prepare data
dumbbell_df <- gapminder %>%
filter(year == 1967 | year == 2007) %>%
select(country, year, lifeExp) %>%
spread(year, lifeExp) %>%
mutate(gap = `2007` - `1967`) %>%
arrange(desc(gap)) %>%
head(10)
#Make plot
ggplot(dumbbell_df, aes(x = `1967`, xend = `2007`, y = reorder(country, gap), group = country)) +
geom_dumbbell(colour = "#dddddd",
size = 3,
colour_x = "#FAAB18",
colour_xend = "#1380A1") +
bbc_style() +
labs(title="We're living longer",
subtitle="Biggest life expectancy rise, 1967-2007")
A 2d density plot is useful to study the relationship between 2 numeric variables if you have a huge number of points.
To avoid overlapping (as in the scatterplot beside), it divides the plot area in a multitude of small fragment and represents the number of points in this fragment.
a <- data.frame( x=rnorm(20000, 10, 1.9), y=rnorm(20000, 10, 1.2) )
b <- data.frame( x=rnorm(20000, 14.5, 1.9), y=rnorm(20000, 14.5, 1.9) )
c <- data.frame( x=rnorm(20000, 9.5, 1.9), y=rnorm(20000, 15.5, 1.9) )
data <- rbind(a,b,c)
# Basic scatterplot
ggplot(data, aes(x=x, y=y) ) +
geom_point()
And this version with and without fill
p1 <- ggplot(data, aes(x=x, y=y) ) +
geom_bin2d() +
theme_bw()
p2 <- ggplot(data, aes(x=x, y=y) ) +
geom_bin2d(bins = 70) +
scale_fill_continuous(type = "viridis") +
theme_bw()
p1 + p2 + plot_layout(ncol=2, guides="collect") & theme(legend.position = 'bottom')
data <- data.frame(Year=rep(c("2014","2015","2016","2017"), times=3, each=1),
Category=rep(c("One","Two","Three"), times=1, each=4),
Value=c(55,62,70,73,30,39,41,56,15,22,26,30),
Width=rep(c(0.9,0.7,0.5), times=1, each=4)
)
data$Category <- factor(data$Category, levels = c("One", "Two", "Three"))
gg <- ggplot(data)
gg <- gg + geom_bar(aes(Year, Value, fill=Category), width=data$Width, stat="identity")
gg <- gg + scale_fill_manual("Key", values = c("One" = "#121c23", "Two" = "#3d5f77", "Three" = "#7db4db"))
gg <- gg + xlab("") + ylab("")
gg <- gg + theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line.x = element_line(color="#919191", size = 0.1),
axis.line.y = element_line(color="#919191", size = 0.1))
gg
## Warning: position_stack requires non-overlapping x intervals
A scatterplot where a third dimension is added - the value of this additional numerical variable is represented through size.
data <- gapminder %>% filter(year == "2007") %>% dplyr::select(-year)
ggplot(data, aes(gdpPercap, lifeExp, size=pop)) +
geom_point(alpha=0.7)
Then control circle size with scale_size()
data %>%
arrange(desc(pop)) %>%
mutate(country = factor(country, country)) %>%
ggplot(aes(x=gdpPercap, y=lifeExp, size = pop)) +
geom_point(alpha=0.5) +
scale_size(range = c(.1, 24), name="Population (M)")
Adding a fourth dimension with color.
data %>%
arrange(desc(pop)) %>%
mutate(country = factor(country, country)) %>%
ggplot(aes(x=gdpPercap, y=lifeExp, size=pop, color=continent)) +
geom_point(alpha=0.5) +
scale_size(range = c(.1, 24), name="Population (M)")
Improve the look of the vis by:
data %>%
arrange(desc(pop)) %>%
mutate(country = factor(country, country)) %>%
ggplot(aes(x=gdpPercap, y=lifeExp, size=pop, fill=continent)) +
geom_point(alpha=0.5, shape=21, color="black") +
scale_size(range = c(.1, 24), name="Population (M)") +
scale_fill_viridis(discrete=TRUE, guide=FALSE, option="A") +
theme_minimal() +
theme(legend.position="none") +
ylab("Life Expectancy") +
xlab("Gdp per Capita")
Now let’s make a version that is interactive by using ggplotly()
data <- gapminder %>% filter(year=="2007") %>% dplyr::select(-year)
p <- data %>%
mutate(gdpPercap=round(gdpPercap,0)) %>%
mutate(pop=round(pop/1000000,2)) %>%
mutate(lifeExp=round(lifeExp,1)) %>%
arrange(desc(pop)) %>%
mutate(country = factor(country, country)) %>%
mutate(text = paste("Country: ", country, "\nPopulation (M): ", pop, "\nLife Expectancy: ", lifeExp, "\nGdp per capita: ", gdpPercap, sep="")) %>%
ggplot( aes(x=gdpPercap, y=lifeExp, size = pop, color = continent, text=text)) +
geom_point(alpha=0.7) +
scale_size(range = c(1.4, 19), name="Population (M)") +
scale_color_viridis(discrete=TRUE, guide=FALSE) +
theme_minimal() +
theme(legend.position="none")
pp <- ggplotly(p, tooltip="text")
pp
# save the widget
# library(htmlwidgets)
# saveWidget(pp, file=paste0( getwd(), "/HtmlWidget/ggplotlyBubblechart.html"))
library(chorddiag)
# Create dummy data
m <- matrix(c(11975, 5871, 8916, 2868,
1951, 10048, 2060, 6171,
8010, 16145, 8090, 8045,
1013, 990, 940, 6907),
byrow = TRUE,
nrow = 4, ncol = 4)
# A vector of 4 colors for 4 groups
haircolors <- c("black", "blonde", "brown", "red")
dimnames(m) <- list(have = haircolors,
prefer = haircolors)
groupColors <- c("#000000", "#FFDD89", "#957244", "#F26223")
# Build the chord diagram:
p <- chorddiag(m, groupColors = groupColors, groupnamePadding = 20)
p
# save the widget
# library(htmlwidgets)
# saveWidget(p, file=paste0( getwd(), "/HtmlWidget/chord_interactive.html"))
df <- expand.grid(x = 1:10, y=1:10)
df$angle <- runif(100, 0, 2*pi)
df$speed <- runif(100, 0, sqrt(0.1 * df$x))
ggplot(df, aes(x, y)) +
geom_point() +
geom_spoke(aes(angle = angle), radius = 0.5)
ggplot(df, aes(x, y)) +
geom_point() +
geom_spoke(aes(angle = angle, radius = speed))
Always important to make sure date variables can be read properly. Use the lubridate package.
time <- c("2019-04-17", "2019-03-21")
ymd(time)
## [1] "2019-04-17" "2019-03-21"
prices <- crypto_prices
#tsviz()
We shall look at a popular time series set of data of energy demand in megawatts. Samply interval of 30 minutes
str(taylor_30_min)
## tibble [4,032 × 2] (S3: tbl_df/tbl/data.frame)
## $ date : POSIXct[1:4032], format: "2000-06-05 00:00:00" "2000-06-05 00:30:00" ...
## $ value: num [1:4032] 22262 21756 22247 22759 22549 ...
Use the plot_time_series() function which uses plotly by default. We shall set ’‘’interactive’’’ variable to true globally so that we get plotly with interaction.
plot_int = TRUE
This is the most basic plot
taylor_30_min %>%
plot_time_series(.date_var = date, .value = value,
.interactive = plot_int,
.plotly_slider = TRUE)
### Grouped time series
m4_daily is a sample of 4 time series from the M4 competition sampled daily.
group_by or using .. o addd groups.facet_ncol = 2 return a 2-column faceted plot.facet_scales = "free" allows the x and y-xis of each plot to scle independently.m4_daily %>%
group_by(id) %>%
plot_time_series(.date_var = date, .value = value,
.facet_ncol = 2, .facet_scales = "free", .interactive = plot_int)
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
The blue line is called the smoother. We can adjust smoother using
.smooth.smooth_period = "52 weeks" (30 days of data) or .smooth_span = 0.2 (25% of the data).m4_weekly %>%
group_by(id) %>%
plot_time_series(date, value,
# Smoother
.smooth = TRUE,
.smooth_span = 0.25, # <- Uses % of data
# .smooth_period = "52 weeks", # <- Uses windows of data
.facet_ncol = 2, .facet_scales = "free",
.interactive = plot_int)
Now we shall use an hourly dataset and show
.color_var to highligh subgroupsHighlighting subgroups in facetet plots but highlight weekly windows (weekly subplots) using week()
m4_hourly %>%
group_by(id) %>%
plot_time_series(date, log(value), # Apply a Log Transformation
.color_var = week(date), # Color applied to Week transformation
# Facet formatting
.facet_ncol = 2,
.facet_scales = "free",
.interactive = plot_int)
taylor_30_min %>%
plot_time_series(date, value,
.color_var = month(date, label = TRUE),
.interactive = FALSE, # <- Returns static ggplot
# Customization
.title = "Taylor's MegaWatt Data",
.x_lab = "Date (30-min intervals)",
.y_lab = "Energy Demand (MW)",
.color_lab = "Month") +
scale_y_continuous(labels = scales::comma_format())
The presnece of correlation that is connected to lagged version of a time series. Means that past history is related to future history
plot_int = FALSE
taylor_30_min %>%
plot_time_series(date, value, .interactive = plot_int)
Now visualise autocorrelation using a new function
taylor_30_min %>%
plot_acf_diagnostics(date, value, .interactive = plot_int)
We can also do grouped ACF and PACFs
m4_hourly %>%
group_by(id) %>%
plot_acf_diagnostics(
date, value, # ACF & PACF
.lags = "14 days", # 14-Days of hourly lags
.interactive = FALSE
)
Grouped analytics highligh similarities and differences between series. We can see here that H150 and H410 have spikes at 1 week in addtion to the daily frequency
The most basic form is to use a line graph with the date on the x-axis
data <- data.frame(
day = as.Date("2017-06-14") - 0:364,
value = runif(365) + seq(-140, 224)^2 / 10000
)
# Most basic bubble plot
p <- ggplot(data, aes(x=day, y=value)) +
geom_line() +
xlab("")
p
Once we have the time variable as a ’‘’date’’’ we can make use of the ’‘’scale_x_date()’’’ function.
For example
p+scale_x_date(date_labels = "%B")
p+scale_x_date(date_labels = "%b %d %Y")
We can use date_breaks and date_minor_breaks to control the amount of break between ticks
Comare the following
p + scale_x_date(date_breaks = "1 week", date_labels = "%W")
p + scale_x_date(date_minor_breaks = "2 day")
data <- data.frame(
day = as.Date("2017-06-14") - 0:364,
value = runif(365) - seq(-140, 224)^2 / 10000
)
# Most basic bubble plot
p <- ggplot(data, aes(x=day, y=value)) +
geom_line( color="#69b3a2") +
xlab("") +
theme_minimal() +
theme(axis.text.x=element_text(angle=60, hjust=1))
p
Usee limit option of scale_x_date() in order to select a time frame in the data
data <- data.frame(
day = as.Date("2017-06-14") - 0:364,
value = runif(365) + seq(-140, 224)^2 / 10000
)
# Most basic bubble plot
p <- ggplot(data, aes(x=day, y=value)) +
geom_line( color="steelblue") +
geom_point(color="darkblue") +
xlab("") +
theme_minimal() +
theme(axis.text.x=element_text(angle=60, hjust=1)) +
scale_x_date(limit=c(as.Date("2017-01-01"),as.Date("2017-02-11"))) +
ylim(0,1.5)
p
## Warning: Removed 323 row(s) containing missing values (geom_path).
## Warning: Removed 323 rows containing missing values (geom_point).
options(scipen=999) # turn-off scientific notation like 1e+48
theme_set(theme_bw()) # pre-set the bw theme.
data("midwest", package = "ggplot2")
# midwest <- read.csv("http://goo.gl/G1K41K") # bkup data source
# Scatterplot
gg <- ggplot(midwest, aes(x=area, y=poptotal)) +
geom_point(aes(col=state, size=popdensity)) +
geom_smooth(method="loess", se=F) +
xlim(c(0, 0.1)) +
ylim(c(0, 500000)) +
labs(subtitle="Area Vs Population",
y="Population",
x="Area",
title="Scatterplot",
caption = "Source: midwest")
plot(gg)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).
data(mpg, package="ggplot2") # alternate source: "http://goo.gl/uEeRGu")
g <- ggplot(mpg, aes(cty, hwy))
g + geom_point() +
geom_smooth(method="lm", se=F) +
labs(subtitle="mpg: cty vs highway mileage",
y="hwy",
x="cty",
title="Scatterplot with overlapping points",
caption="Source: midwest") +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
There are actually lots of overlapping data points here, so jitter would be a better choice
data(mpg, package="ggplot2")
# mpg <- read.csv("http://goo.gl/uEeRGu")
g <- ggplot(mpg, aes(cty, hwy))
g + geom_jitter(width = .5, size=1) +
geom_smooth(method="lm", se=F) +
labs(subtitle="mpg: city vs highway mileage",
y="hwy",
x="cty",
title="Jittered Points") +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
ggplot(gapminder, aes(x=gdpPercap, y=lifeExp, color=continent, size=pop)) +
geom_point() +
scale_x_log10() +
facet_wrap(~ year) +
labs(
title = "GDP per capita and life expectancy between 1952 and 2007",
caption = "Source: Gapminder"
) +
theme_minimal()
The default coordinate system is the Cartesian coordinate system where the x and y positions act independently to determine the location of each point. There are a number of other coordinate systems that are occasionally helpful.
coord_flip() - swiches x and y axisggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
geom_boxplot() +
labs(
title = "Too many x values"
)
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
geom_boxplot() +
coord_flip() +
labs(
title = "Now looks better"
)
coord_quickmap()Useful for map data scaling
nz <- map_data("usa")
ggplot(nz, aes(long, lat, group = group)) +
geom_polygon(fill = "white", colour = "black")
ggplot(nz, aes(long, lat, group = group)) +
geom_polygon(fill = "white", colour = "black") +
coord_quickmap()
coord_polar()Interesting alternative to bar plots
bar <- ggplot(data = diamonds) +
geom_bar(
mapping = aes(x = cut, fill = cut),
show.legend = FALSE,
width = 1
) +
theme(aspect.ratio = 1) +
labs(x = NULL, y = NULL)
bar + coord_flip()
bar + coord_polar()
We need to use the following library and keep a separate js file
devtools::install_github("rstudio/r2d3")
library(r2d3)
data <- data.frame(year=c(2011:2016), value=c(0.45, 0.47, 0.52, 0.7, 0.59, 0.47))
r2d3(data=data,
script = "./d3-bar-chart.js",
options = list(margin = 50,
barPadding = 0.1,
colour = "rgba(255,0,0,1)",
xLabel = "Year",
yLabel = "Value",
chartTitle = "Bar Chart with external D3 JS File"
)
)
The following will require a js file to actually render
//Set some initial values
var margin = options.margin,
barPadding = options.barPadding,
width = width-(2*margin),
height = height-(2*margin),
barWidth = Math.floor(width/data.length),
xmax = d3.max(data, function(d) { return d.year; }),
xmin = d3.min(data, function(d) { return d.year; }),
ymax = d3.max(data, function(d) { return d.value; });
//Create the chart
svg.selectAll('rect')
.data(data)
.enter()
.append('rect')
.attr('height', function(d) { return d.value/ymax * height; })
.attr('width', barWidth-barPadding)
.attr('x', function(d, i) { return (margin+(i * barWidth)); })
.attr('y', function(d) { return (height+margin-(d.value/ymax * height)); })
.attr('fill', options.colour)
//Hover effects
.on('mouseover', function (d, i) {
d3.select(this).transition()
.duration('50')
.attr('opacity', '.5')})
.on('mouseout', function (d, i) {
d3.select(this).transition()
.duration('300')
.attr('opacity', '1')})
//Create the x axis
var x = d3.scaleBand()
.domain(data.map(function(d) { return d.year; }))
.range([0, width-barPadding]);
svg.append("g")
.attr("transform", "translate(" + margin + "," + (height+margin) + ")")
.call(d3.axisBottom(x));
svg.append("text")
.attr("transform", "translate(" + (width/2) + " ," + (height+2*margin) + ")")
.attr("dx", "1em")
.style("text-anchor", "middle")
.style("font-family", "Tahoma, Geneva, sans-serif")
.style("font-size", "12pt")
.text(options.xLabel);
//Create the y axis
var y = d3.scaleLinear()
.range([height, 0])
.domain([0, ymax]);
svg.append("g")
.attr("transform", "translate(" + margin + ", " + margin + ")")
.call(d3.axisLeft(y));
svg.append("text")
.attr("transform", "translate(" + 0 + " ," + ((height+2*margin)/2) + ") rotate(-90)")
.attr("dy", "1em")
.style("text-anchor", "middle")
.style("font-family", "Tahoma, Geneva, sans-serif")
.style("font-size", "12pt")
.text(options.yLabel);
//Create the chart title
svg.append("text")
.attr("x", (width / 2))
.attr("y", (margin/2))
.attr("text-anchor", "middle")
.attr("dx", "1em")
.style("font-size", "16pt")
.style("font-family", "Tahoma, Geneva, sans-serif")
.text(options.chartTitle);
Animations is different to interactive
The key libraries are gganimate and gifski. It allows you to provide a frame (animation step) as another aesthetic to ggplot.
Important - on R Studio Server it is necessary to use webm, possibly important to install gifski as well.
A raw graph that we want to animate. It is a mess as we are missing the key year grouping.
p <- ggplot(gapminder,
aes(
x = gdpPercap,
y = lifeExp,
size = pop,
colour = country
)) +
geom_point(show.legend = FALSE, alpha = 0.7) +
scale_color_viridis_d() +
scale_size(range = c(2, 12)) +
scale_x_log10() +
labs(
x = "GDP per capita",
y = "Life expectancy",
title = "GDP per capita vs Life Expectancy for\n global countries and continents",
alpha = 0.7,
size = 1.1
)
In our first version we animate by year.
# we add a transition by the year variable
anim <- p + transition_time(year) +
labs(title = "Year: {frame_time}")
# we animate it. Note: for dev purposes set the nframes count fairly low. 50 is the normal default.
animate(anim, nframes = 50, renderer = gifski_renderer())
In this version we facet by continent and also animate by year
continent <- p + facet_wrap(~continent) +
transition_time(year) +
labs(title = "Year: {frame_time}")
# note - this renders in the plots area
animate(continent, nframes = 50, renderer = gifski_renderer())
We can also get the x-axis to follow the view:
anim <- p + transition_time(year) +
labs(title = "Year: {frame_time}") +
view_follow(fixed_y = TRUE)
animate(anim, nframes = 20, renderer = gifski_renderer())
gapminder %>%
plot_ly(
x = ~gdpPercap,
y = ~lifeExp,
size = ~pop,
color = ~continent,
frame = ~year,
text = ~country,
hoverinfo = "text",
type = "scatter",
mode = "markers"
) %>%
layout(xaxis = list(type = "log"))
There are various rendering option for showing in Rstudio viewer vs html_notebook output.
This version only renders on html production
for (i in 1:2) {
pie(c(i %% 2, 6),
col = c('red', 'yellow'),
labels = NA)
}
gapminder_2007 <- gapminder %>%
filter(year == 2007)
ggplot(gapminder_2007, aes(x = continent, y = gdpPercap)) +
geom_boxplot() +
scale_y_log10() +
ggtitle("Comparing GDP Per Cap across continents")
There is also the very minimal Tufte version!
theme_set(theme_tufte())
g <- ggplot(mpg, aes(manufacturer, cty))
g + geom_tufteboxplot() +
theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
labs(
title = "Tufte Styled Boxplot",
subtitle = "City Mileage grouped by Class of vehicle",
caption = "Source: top 50 ggplots",
x = "Class of Vehicle",
y = "City Mileage"
)
A bar chart that can handle both positive and negative values
data("mtcars")
mtcars$`car name` <-
rownames(mtcars) # create new column for car names
mtcars$mpg_z <-
round((mtcars$mpg - mean(mtcars$mpg)) / sd(mtcars$mpg), 2) # compute normalized mpg
mtcars$mpg_type <-
ifelse(mtcars$mpg_z < 0, "below", "above") # above / below avg flag
mtcars <- mtcars[order(mtcars$mpg_z), ] # sort
mtcars$`car name` <-
factor(mtcars$`car name`, levels = mtcars$`car name`) # convert to factor to retain sorted order in plot.
ggplot(mtcars, aes(x = `car name`, y = mpg_z, label = mpg_z)) +
geom_bar(stat = "identity", aes(fill = mpg_type), width = .5) +
scale_fill_manual(
name = "Mileage",
labels = c("Above Average", "Below Average"),
values = c("above" = "#00ba38", "below" = "#f8766d")
) +
labs(
subtitle = "Normalised mileage from 'mtcars'",
title = "Diverging Bars",
x = "Car Nae",
y = "Miles per gallon",
source = "Top 50 ggplot"
) +
coord_flip() +
theme_bw()
ggplot(mtcars, aes(x = `car name`, y = mpg_z, label = mpg_z)) +
geom_point(stat = 'identity', fill = "black", size = 5) +
geom_segment(aes(
y = 0,
x = `car name`,
yend = mpg_z,
xend = `car name`
),
color = "black") +
geom_text(color = "white", size = 2) +
labs(
title = "Diverging Lollipop Chart",
subtitle = "Normalized mileage from 'mtcars': Lollipop",
x = 'Miles per gallon',
y = 'Car name',
caption = 'Top 50 ggplot'
) +
ylim(-2.5, 2.5) +
coord_flip() +
theme_minimal()
The dot plot is similar
ggplot(cty_mpg, aes(x = make, y = mileage)) +
geom_point(col = "tomato2", size = 3) +
geom_segment(aes(
x = make,
xend = make,
y = min(mileage),
yend = max(mileage)
),
linetype = "dashed",
size = 0.1) +
labs(title = "Dot Plot",
subtitle = "Make Vs Avg. Mileage",
caption = "Source: Top 50 ggplots") +
coord_flip() +
theme_minimal()
popgrid.bolivia <- read.csv("./data/popgrid_bolivia.csv")
#View(popgrid.bolivia)
We can quickly plot using geom_point() and set the size to the population.
map <- ggplot(popgrid.bolivia, aes(x = x, y = y, size = population)) +
geom_point() +
theme_void()
map
Because the map’s main purpose is not to show exactly how many people live in a specific area, and because the legend hinders the alignment of the map and the histograms, you’re going to remove the map legend. You can give the the dots a nice color, and because some of the dots are overlapping, you can add some transparency. Finaly, use scale_size_area() to set the maximum size of the dots.
map <- ggplot(popgrid.bolivia, aes(x = x, y = y, size = population)) +
geom_point(color = "#7A0018", alpha = 0.8) +
theme_void() +
theme(legend.position = "none") +
scale_size_area(max_size = 10)
map
Next we want to add histogram axes to the graph. First we need to calculate the sum of each x and y combination.
bylong <- group_by(popgrid.bolivia, x) %>% summarise(total = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
colnames(bylong) <- c("long", "total")
bylat <- group_by(popgrid.bolivia, y) %>% summarise(total = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
colnames(bylat) <- c("lat", "total")
We then create the long aspect
long.histo <- ggplot(bylong, aes(y = total, x = long)) +
geom_col(fill = "#7A0018") +
theme_void() +
scale_y_reverse()
long.histo
and then the lat aspect
lat.histo <- ggplot(bylat, aes(y = total, x = lat)) +
geom_col(fill = "chocolate") +
theme_void() +
coord_flip()
lat.histo
We then use pathwork to knit the charts together into a single figure
pacman::p_load(
patchwork
)
historatio <- max(bylat$total)/max(bylong$total)
{map + lat.histo} + {long.histo} + plot_layout(ncol = 2, nrow = 2, widths = c(1,1), heights = c(1,historatio))
To slightly reduce the amount of space that the histograms taken up we can set a
histowidth variable
histowidth <- 0.5
{map + lat.histo} + {long.histo} + plot_layout(ncol = 2, nrow = 2, widths = c(1,histowidth), heights = c(1,historatio*histowidth))
So far we have not projected the map meaning that if we change dimensions of the chart the proportions wil look odd. So you need to make sure that that the aspect ratio of the map resembles the aspect ratio on a real map. With the following trick, you can just do that. You can calculate the width to height ratio for the projected map, and then use this ratio to set the dimensions of the map so that the shape of the country is not distorted in an exaggerated way.
coord <- coord_quickmap(xlim = range(popgrid.bolivia$x), ylim = range(popgrid.bolivia$y), expand = F)
map.aspect <- coord$aspect(list(x.range = range(popgrid.bolivia$x), y.range = range(popgrid.bolivia$y)))
{map + lat.histo} + {long.histo} + plot_layout(ncol = 2, nrow = 2, heights = c(1,histowidth), widths = c(1/map.aspect, histowidth/historatio))
The final step is to save the figure with a fixed outputfactor
outputfactor <- 10
ggsave("bolivia.png", units = "cm", height = outputfactor*(1 + histowidth), width = outputfactor*(1/map.aspect + histowidth/historatio))
ggmaps is a great library for adapting ggplot2 to allow inclusiong of map layers from third parties.
Note: it requires registration with google to use
pacman::p_load(ggmap)
register_google(key = "[AIzaSyCdU_gwCrEyHFkd3Mhm7R51yiuTZj5YwnE]")
#library("ggmap")
us <- c(left = -125, bottom = 25.75, right = -67, top = 49)
st_map <-get_stamenmap(us, zoom = 5, maptype = "toner-lite") %>% ggmap()
## Source : http://tile.stamen.com/toner-lite/5/4/10.png
## Source : http://tile.stamen.com/toner-lite/5/5/10.png
## Source : http://tile.stamen.com/toner-lite/5/6/10.png
## Source : http://tile.stamen.com/toner-lite/5/7/10.png
## Source : http://tile.stamen.com/toner-lite/5/8/10.png
## Source : http://tile.stamen.com/toner-lite/5/9/10.png
## Source : http://tile.stamen.com/toner-lite/5/10/10.png
## Source : http://tile.stamen.com/toner-lite/5/4/11.png
## Source : http://tile.stamen.com/toner-lite/5/5/11.png
## Source : http://tile.stamen.com/toner-lite/5/6/11.png
## Source : http://tile.stamen.com/toner-lite/5/7/11.png
## Source : http://tile.stamen.com/toner-lite/5/8/11.png
## Source : http://tile.stamen.com/toner-lite/5/9/11.png
## Source : http://tile.stamen.com/toner-lite/5/10/11.png
## Source : http://tile.stamen.com/toner-lite/5/4/12.png
## Source : http://tile.stamen.com/toner-lite/5/5/12.png
## Source : http://tile.stamen.com/toner-lite/5/6/12.png
## Source : http://tile.stamen.com/toner-lite/5/7/12.png
## Source : http://tile.stamen.com/toner-lite/5/8/12.png
## Source : http://tile.stamen.com/toner-lite/5/9/12.png
## Source : http://tile.stamen.com/toner-lite/5/10/12.png
## Source : http://tile.stamen.com/toner-lite/5/4/13.png
## Source : http://tile.stamen.com/toner-lite/5/5/13.png
## Source : http://tile.stamen.com/toner-lite/5/6/13.png
## Source : http://tile.stamen.com/toner-lite/5/7/13.png
## Source : http://tile.stamen.com/toner-lite/5/8/13.png
## Source : http://tile.stamen.com/toner-lite/5/9/13.png
## Source : http://tile.stamen.com/toner-lite/5/10/13.png
st_map
### Map Scatterplot
pacman::p_load(
dplyr,
forcats
)
`%notin%` <- function(lhs, rhs) !(lhs %in% rhs)
violent_crimes <- crime %>%
filter(
offense %notin% c("auto theft", "theft", "burglary"),
-95.39681 <= lon & lon <= -95.34188,
29.73631 <= lat & lat <= 29.78400
) %>%
mutate(
offense = fct_drop(offense),
offense = fct_relevel(offense, c("robbery", "aggravated assault", "rape", "murder"))
)
# use qmplot to make a scatterplot on a map
plot <- qmplot(lon, lat, data = violent_crimes, maptype = "toner-lite", color = I("red"))
## Using zoom = 14...
## Source : http://tile.stamen.com/terrain/14/3850/6770.png
## Source : http://tile.stamen.com/terrain/14/3851/6770.png
## Source : http://tile.stamen.com/terrain/14/3852/6770.png
## Source : http://tile.stamen.com/terrain/14/3853/6770.png
## Source : http://tile.stamen.com/terrain/14/3850/6771.png
## Source : http://tile.stamen.com/terrain/14/3851/6771.png
## Source : http://tile.stamen.com/terrain/14/3852/6771.png
## Source : http://tile.stamen.com/terrain/14/3853/6771.png
## Source : http://tile.stamen.com/terrain/14/3850/6772.png
## Source : http://tile.stamen.com/terrain/14/3851/6772.png
## Source : http://tile.stamen.com/terrain/14/3852/6772.png
## Source : http://tile.stamen.com/terrain/14/3853/6772.png
## Source : http://tile.stamen.com/terrain/14/3850/6773.png
## Source : http://tile.stamen.com/terrain/14/3851/6773.png
## Source : http://tile.stamen.com/terrain/14/3852/6773.png
## Source : http://tile.stamen.com/terrain/14/3853/6773.png
plot
robberies <- violent_crimes %>% filter(offense == "robbery")
qmplot(lon, lat, data = violent_crimes, geom = "blank",
zoom = 14, maptype = "toner-background", darken = .7, legend = "topleft"
) +
stat_density_2d(aes(fill = ..level..), geom = "polygon", alpha = .3, color = NA) +
scale_fill_gradient2("Robbery\nPropensity", low = "white", mid = "yellow", high = "red", midpoint = 650)
# Map tiles by Stamen Design, under CC BY 3.0. Data by OpenStreetMap, under ODbL.
qmplot(lon, lat, data = violent_crimes, maptype = "toner-background", color = offense) +
facet_wrap(~ offense)
## Using zoom = 14...
Leaflet is a good starting point
This is the birthplace of R!
leaflet() %>%
addTiles() %>%
addMarkers(lng=174.768, lat=-36.852, popup="The birthplace of R")
The following packages are needed for this type of visualisation
pacman::p_load(
ggmap,
rgdal,
rgeos,
maptools,
dplyr,
tidyr,
tmap
)
We shall read some data about sports in london. It will return a SpatialPolygonsDataFrame
lnd <- readOGR(dsn = "./data", layer = "london_sport")
## OGR data source with driver: ESRI Shapefile
## Source: "/home/edwallitt/r-exploration/r-graphing-gallery/r-graphing-gallery/data", layer: "london_sport"
## with 33 features
## It has 4 fields
## Integer64 fields read as strings: Pop_2001
lnd$Pop_2001 <- as.numeric(as.character(lnd$Pop_2001))
This is a basic boundary plot of london
plot(lnd)
We can now highlight areas that have a variable value above a certain amount, and set colors for it.
plot(lnd, col = "lightgrey") # plot the london_sport object
sel <- lnd$Partic_Per > 25
plot(lnd[ sel, ], col = "turquoise", add = TRUE) # add selected zones to map
We create a data frame for all available EPSG code
EPSG <- make_EPSG() # create data frame of available EPSG codes
EPSG[grepl("WGS 84$", EPSG$note), ] # search for WGS 84 code
We now transform the london data and save it.
lnd84 <- spTransform(lnd, CRS("+init=epsg:4326"))
saveRDS(object = lnd84, file = "./data/lnd84.Rds")
# Create new object called "lnd" from "london_sport" shapefile
lnd <- readOGR(dsn = "./data", layer = "london_sport")
## OGR data source with driver: ESRI Shapefile
## Source: "/home/edwallitt/r-exploration/r-graphing-gallery/r-graphing-gallery/data", layer: "london_sport"
## with 33 features
## It has 4 fields
## Integer64 fields read as strings: Pop_2001
plot(lnd)
Now let’s grab some crime data for london boroughs.
crime_data <- read.csv("data/mps-recordedcrime-borough.csv", stringsAsFactors = FALSE)
head(crime_data)
We aggregate the sum of crimes in each borough
crime_ag <- aggregate(CrimeCount ~ Borough, FUN = sum, data = crime_data)
head(crime_ag, 2)
We now join the london data and the crime data by name and Borough, and then use the tmap library to plot a basic map
lnd@data
lnd@data <- left_join(lnd@data, crime_ag, by = c('name' = 'Borough'))
qtm(lnd, "CrimeCount") # plot the basic map
This is from the excellent Geocomputation book
pacman::p_load(
sf,
raster,
dplyr,
spData,
spDataLarge
)
## Installing package into '/home/edwallitt/R/x86_64-pc-linux-gnu-library/3.6'
## (as 'lib' is unspecified)
## Warning: package 'spDataLarge' is not available (for R version 3.6.2)
## Warning in p_install(package, character.only = TRUE, ...):
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'spDataLarge'
## Warning in pacman::p_load(sf, raster, dplyr, spData, spDataLarge): Failed to install/load:
## spDataLarge
Let’s create a random lat long point for london. We then us st_set_crs() with 4327 to project the lat long version.
london = data.frame(lon = -0.1, lat = 51.5) %>%
st_as_sf(coords = c("lon", "lat"))
london_geo = st_set_crs(london, 4326)
st_is_longlat(london_geo)
## [1] TRUE
We now try using EPSG 27700
london_proj = data.frame(x = 530000, y = 180000) %>%
st_as_sf(coords = 1:2, crs = 27700)
st_crs(london_proj)
## Coordinate Reference System:
## User input: EPSG:27700
## wkt:
## PROJCS["OSGB 1936 / British National Grid",
## GEOGCS["OSGB 1936",
## DATUM["OSGB_1936",
## SPHEROID["Airy 1830",6377563.396,299.3249646,
## AUTHORITY["EPSG","7001"]],
## TOWGS84[446.448,-125.157,542.06,0.15,0.247,0.842,-20.489],
## AUTHORITY["EPSG","6277"]],
## PRIMEM["Greenwich",0,
## AUTHORITY["EPSG","8901"]],
## UNIT["degree",0.0174532925199433,
## AUTHORITY["EPSG","9122"]],
## AUTHORITY["EPSG","4277"]],
## PROJECTION["Transverse_Mercator"],
## PARAMETER["latitude_of_origin",49],
## PARAMETER["central_meridian",-2],
## PARAMETER["scale_factor",0.9996012717],
## PARAMETER["false_easting",400000],
## PARAMETER["false_northing",-100000],
## UNIT["metre",1,
## AUTHORITY["EPSG","9001"]],
## AXIS["Easting",EAST],
## AXIS["Northing",NORTH],
## AUTHORITY["EPSG","27700"]]
london2 = st_transform(london_geo, 27700)
st_distance(london2, london_proj)
## Units: [m]
## [,1]
## [1,] 2017.95
This is a function to convert long lat to UTM
lonlat2UTM = function(lonlat) {
utm = (floor((lonlat[1] + 180) / 6) %% 60) + 1
if(lonlat[2] > 0) {
utm + 32600
} else{
utm + 32700
}
}
We now use the function to convert a single lat long and project it using EPSG 4326
epsg_utm_auk = lonlat2UTM(c(174.7, -36.9))
epsg_utm_lnd = lonlat2UTM(st_coordinates(london))
st_crs(epsg_utm_auk)$proj4string
## [1] "+proj=utm +zone=60 +south +datum=WGS84 +units=m +no_defs "
#> [1] "+proj=utm +zone=60 +south +datum=WGS84 +units=m +no_defs"
st_crs(epsg_utm_lnd)$proj4string
## [1] "+proj=utm +zone=30 +datum=WGS84 +units=m +no_defs "
#> [1] "+proj=utm +zone=30 +datum=WGS84 +units=m +no_defs"
london_geo
cycle_hire_osm
cycle_hire_osm_projected <- st_transform(cycle_hire_osm, 27700)
cycle_hire_osm_projected
Two categories of function
They all begin with ‘stats_’.
An example of one that is automatically used is stats_bin as part of geom_histogram()
ggplot(iris, aes(Sepal.Width)) + geom_histogram(binwidth = 0.5)
ggplot(msleep, aes(bodywt, y = 1)) +
geom_jitter() +
scale_x_log10()
ggplot(msleep, aes(log10(bodywt), y = 1)) +
geom_jitter() +
scale_x_continuous(limit = c(-3,4), breaks = -3:4) +
annotation_logticks(sides = "b")
labs ReferenceA full set of themes can be seem here.
This is a good example which shows the various elements you can set in labs
mtcars2 <- within(mtcars, {
vs <- factor(vs, labels = c("V-shaped", "Straight"))
am <- factor(am, labels = c("Automatic", "Manual"))
cyl <- factor(cyl)
gear <- factor(gear)
})
p1 <- ggplot(mtcars2) +
geom_point(aes(x = wt, y = mpg, colour = gear)) +
labs(title = "Fuel economy declines as weight increases",
subtitle = "(1973-74)",
caption = "Data from the 1974 Motor Trend US magazine.",
tag = "Figure 1",
x = "Weight (1000 lbs)",
y = "Fuel economy (mpg)",
colour = "Gears")
p1 + theme_gray() # the default
hist_df <- gapminder %>%
filter(year == 2007)
ggplot(hist_df, aes(lifeExp)) +
geom_histogram(binwidth = 5, colour = "white", fill = "#1380A1") +
geom_hline(yintercept = 0, size = 1, colour="#333333") +
bbc_style() +
scale_x_continuous(limits = c(35, 95),
breaks = seq(40, 90, by = 10),
labels = c("40", "50", "60", "70", "80", "90 years")) +
labs(title = "How life expectancy varies",
subtitle = "Distribution of life expectancy in 2007")
## Warning: Removed 2 rows containing missing values (geom_bar).
A library for creating professional-looking plots
pacman::p_load(
ggpubr,
colorspace,
wesanderson,
ggrepel
)
We shall be working with iris dataset
head(iris)
First, we shall create a histogram using ggpubr
gghistogram(iris, x="Sepal.Length", color="Species", fill="Species", palette = wes_palette("Darjeeling1"))
## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.
This would probably be easier to read as a density plot
ggdensity(iris, x="Sepal.Length", color="Species", fill="Species", palette = wes_palette("Darjeeling1"))
ggboxplot(iris, x = "Species", y = "Sepal.Length", color = "Species", palette = qualitative_hcl(3, palette = "Dynamic"), add = "jitter", shape = "Species")
## Dot chart
Moving onto some multivariate charts
head(mtcars)
We shall add a column to this dataset representing deviaation from mpg column from the average
mtcars$mpg_dev <- (mtcars$mpg -mean(mtcars$mpg)) / sd(mtcars$mpg)
mtcars$name <- row.names(mtcars)
mtcars$cyl_discr <- as.factor(mtcars$cyl)
Now we can visualise how each car deviates from the average, grouping by number of cylinders
ggdotchart(mtcars,
x = "name",
y = "mpg_dev",
color = "cyl_discr",
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
sorting = "descending",
add = "segments",
add.params = list(color = "lightgray", size = 1),
group = "cyl_discr",
dot.size = 2,
rotate = TRUE,
title = "Miles per gallon",
subtitle = "Per number of cylinders",
caption = "Package: ggpubr") +
geom_hline(yintercept = 0, linetype = 2, color = "lightgray") +
font("y.text", size=5) +
font("title", color = darken("#00AFBB", amount = 0.3)) +
font("caption", face = "italic")
p1 <- ggscatter(mtcars, x = "wt", y = "mpg", color="cyl_discr",
palette = "jco", shape = "cyl_discr", label = "name", repel = TRUE,
label.select = dplyr::filter(mtcars, mpg > 25 | wt > 5) %>% .$name)
p1
Adding a regression line
p2 <- ggscatter(mtcars, x = "wt", y = "mpg", color = "cyl_discr", palette = "jco",
shape = "cyl_discr", add = "reg.line", conf.int=TRUE)
p2
## `geom_smooth()` using formula 'y ~ x'
Now we can use patchwork to add these two charts together
pacman::p_load(
patchwork
)
p1 + p2
## `geom_smooth()` using formula 'y ~ x'
A raving by Dr Ed Wallitt
ed@wallitt.io