factors

# Fix the RNG seed so the random sample below is reproducible.
set.seed(133)
# Draw 50 month names at random, with replacement.
a <- sample(x = month.name, size = 50, replace = TRUE)
# Build a factor whose levels follow calendar order and whose labels are the
# three-letter month abbreviations.
f <- factor(x = a, levels = month.name, labels = month.abb)
# Confirm that the result really is a factor.
is.factor(f)
## [1] TRUE

| levels and labels

# Inspect the factor with levels() and labels(). levels() returns the values
# that were passed as `labels` to factor(); labels() does NOT (see its output).
levels(f)
##  [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
# labels() is a generic unrelated to factor labels: here it returns the element
# positions "1".."50" as character, not the month abbreviations.
labels(f)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30"
## [31] "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44" "45"
## [46] "46" "47" "48" "49" "50"

| transform to character and numeric

# convert the factor to a character vector holding the label of each element
as.character(f)  # gives the same values as levels(f)[f]
##  [1] "Sep" "Jun" "Apr" "Jan" "Jan" "Sep" "Jun" "Feb" "Aug" "Apr" "Feb" "Jun"
## [13] "Jan" "Dec" "Jan" "Jan" "Sep" "Dec" "Mar" "Apr" "Feb" "Aug" "Aug" "Mar"
## [25] "Sep" "Mar" "Dec" "Aug" "Feb" "Sep" "Jan" "Aug" "Sep" "Oct" "Jan" "Aug"
## [37] "Jan" "Aug" "May" "Dec" "Apr" "Jul" "Mar" "Dec" "Apr" "Feb" "May" "Sep"
## [49] "Mar" "Sep"
# as.numeric() on a factor returns the integer codes, i.e. the position of each
# element's level in levels(f) (e.g. "Sep" -> 9)
as.numeric(f)
##  [1]  9  6  4  1  1  9  6  2  8  4  2  6  1 12  1  1  9 12  3  4  2  8  8  3  9
## [26]  3 12  8  2  9  1  8  9 10  1  8  1  8  5 12  4  7  3 12  4  2  5  9  3  9

| count

# From here on, use the forcats package for working with factors.
# Documentation: https://forcats.tidyverse.org/index.html
library(forcats)
# Count how often each level occurs; note that Nov has 0 occurrences.
table(f)
## f
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec 
##   8   5   5   5   2   3   1   7   8   1   0   5
# forcats counterpart of table(): a tibble with one row per level (f) and its
# count (n), including levels with zero occurrences
fct_count(f)
## # A tibble: 12 × 2
##    f         n
##    <fct> <int>
##  1 Jan       8
##  2 Feb       5
##  3 Mar       5
##  4 Apr       5
##  5 May       2
##  6 Jun       3
##  7 Jul       1
##  8 Aug       7
##  9 Sep       8
## 10 Oct       1
## 11 Nov       0
## 12 Dec       5

| drop unused levels (Nov)

# Drop every level that never occurs in the data, then list what is left.
f <- fct_drop(f)
levels(f)
##  [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Dec"

| reorder making Dec the first month

# Move "Dec" to the front of the level order; the other levels keep their
# relative order.
f <- fct_relevel(f, "Dec")
levels(f)
##  [1] "Dec" "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct"

| order using frequency

# Sort the levels by how often they occur, most frequent first.
f <- fct_infreq(f)
levels(f)
##  [1] "Jan" "Sep" "Aug" "Dec" "Feb" "Mar" "Apr" "Jun" "May" "Jul" "Oct"

| use only the top 3 most frequent levels and collapse the rest into others

# Keep only the 3 most frequent levels and collapse all others into "Other".
# fct_lump_n() is the explicit, recommended variant of the generic fct_lump().
f <- fct_lump_n(f, n = 3)
levels(f)
## [1] "Jan"   "Sep"   "Aug"   "Other"

| reverse the order of levels

# Flip the level order so the last level becomes the first.
f <- fct_rev(f)
levels(f)
## [1] "Other" "Aug"   "Sep"   "Jan"

| recode some or all levels using new levels

# Rename selected levels (new name = "old name"); levels not mentioned are kept.
# The result is only printed, `f` itself is not modified.
fct_recode(f, Boring = "Jan", Hot = "Aug")
##  [1] Sep    Other  Other  Boring Boring Sep    Other  Other  Hot    Other 
## [11] Other  Other  Boring Other  Boring Boring Sep    Other  Other  Other 
## [21] Other  Hot    Hot    Other  Sep    Other  Other  Hot    Other  Sep   
## [31] Boring Hot    Sep    Other  Boring Hot    Boring Hot    Other  Other 
## [41] Other  Other  Other  Other  Other  Other  Other  Sep    Other  Sep   
## Levels: Other Hot Sep Boring

ggplot

data

| select data to be used for plotting

# Data set used by all plots below.
ds <- gapminder::gapminder

# Plot skeleton only: data and x/y aesthetics, no geom layer yet.
ggplot(ds, aes(x = gdpPercap, y = lifeExp))

geoms

| geom_point with x and y aesthetics

# Scatter plot of life expectancy against GDP per capita for 2007.
# `year` is numeric, so compare it with a number rather than a string.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

| geom_point with other aesthetics

# Same scatter plot, with point size mapped to population and colour to
# continent; alpha makes overlapping points visible.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp, size = pop, color = continent)) +
  geom_point(alpha = 0.6)

| geom_smooth

# Scatter plot with a loess trend line and no confidence band.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE)

| geom_line

# Life expectancy over time for a single country, as points joined by a line.
ds %>%
  filter(country == "Jordan") %>%
  ggplot(aes(x = year, y = lifeExp)) +  # try adding color = continent
  geom_point() +
  geom_line(na.rm = TRUE)

| geom_line another example

# Population over time for every European country.
ds %>%
  filter(continent == "Europe") %>%  # keep European countries only
  # order the legend by each country's last population value, largest first
  ggplot(aes(x = year, y = pop,
             color = fct_reorder(country, pop, tail, n = 1, .desc = TRUE))) +
  geom_point() +
  geom_line(na.rm = TRUE) +
  labs(color = "Countries", x = "") +
  theme_bw() +
  # give breaks and labels together so each label is tied to a known tick;
  # labels alone only work while the automatic breaks happen to match
  scale_y_continuous(breaks = seq(0, 8e7, 2e7),
                     labels = c("0", "20M", "40M", "60M", "80M")) +
  scale_x_continuous(breaks = seq(1952, 2007, 5)) +
  theme(axis.text.x = element_text(size = 8))

| geom_point

# Points on a discrete x axis: every country of a continent lands on the same
# vertical line.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = continent, y = lifeExp)) +
  geom_point()

| geom_jitter

# Jitter spreads the points horizontally (by up to `width`) to reduce overlap.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = continent, y = lifeExp, size = pop, col = continent)) +
  geom_jitter(width = 0.2)

| geom_histogram

# Histogram of life expectancy in 2007 with the default number of bins
# (ggplot2 prints a message suggesting to choose `binwidth` explicitly).
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = lifeExp)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

| geom_bar

# Bar chart: geom_bar() counts the rows (countries) per continent.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = continent)) +   # try adding fill = country
  geom_bar()

| geom_bar with numbers on top

# Bar chart with the count printed above each bar.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = continent)) +   # try adding fill = country
  geom_bar() +
  # after_stat(count) replaces the deprecated ..count.. notation
  geom_text(aes(label = after_stat(count)), stat = "count",
            color = "red", vjust = -0.3)

| geom_bar with fill colors

# Stacked bars: the first 20 countries of 2007, one fill colour per country.
ds %>%
  filter(year == 2007) %>%
  slice(1:20) %>%
  ggplot(aes(x = continent, fill = country)) +
  geom_bar()

| geom_bar with fill position

# Same bars, rescaled so every bar has height 1 (shares instead of counts).
ds %>%
  filter(year == 2007) %>%
  slice(1:20) %>%
  ggplot(aes(x = continent, fill = country)) +
  geom_bar(position = "fill")  # also try position = "stack" or "dodge"

| geom_bar ordered

# Bars ordered by frequency: fct_infreq() puts the most common continent first.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = fct_infreq(continent))) +   # try adding fill = country
  geom_bar()

| geom_piechart

There is no geom_piechart; a pie chart is actually a stacked bar chart drawn in polar coordinates.

# Pie chart: a single stacked bar (x = "") wrapped around the circle.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = "", fill = continent)) +
  geom_bar() +
  coord_polar(theta = "y") +
  theme_void()

| pie_chart with percentages

# Pie chart with percentage labels, built from pre-counted data.
ds %>%
  filter(year == 2007) %>%
  count(continent) %>%
  ggplot(aes(x = "", y = n, fill = continent)) +
  geom_col() +
  # label each slice with its share of the total, rounded to whole percent,
  # placed in the middle of the slice
  geom_text(aes(label = scales::percent(n / sum(n), 1)),
            position = position_stack(vjust = 0.5), size = 5) +
  coord_polar(theta = "y") +
  theme_void()

| pie_chart function

# Draw a pie chart with percentage labels from pre-counted data.
#
# DS: data frame with at least two columns:
#   FILL - character or factor, one value per slice (mapped to the fill colour)
#   n    - numeric, the count for each slice
# Returns a ggplot object.
pie_chart <- function(DS) {
  # fail early with a clear message instead of a late "object not found" error
  stopifnot(is.data.frame(DS), all(c("FILL", "n") %in% names(DS)))
  ggplot(DS, aes(x = "", y = n, fill = FILL)) +
    geom_col() +
    # label each slice with its share of the total, rounded to whole percent
    geom_text(aes(label = scales::percent(n / sum(n), 1)),
              position = position_stack(vjust = 0.5), size = 5) +
    # a pie chart is a stacked bar drawn in polar coordinates
    coord_polar(theta = "y") +
    theme_void()
}


ds %>%
  filter(year == 2007) %>%
  count(continent) %>%
  mutate(FILL = continent) %>%
  pie_chart()

pie_chart
## function(DS) {
##   # fail early with a clear message instead of a late "object not found" error
##   stopifnot(is.data.frame(DS), all(c("FILL", "n") %in% names(DS)))
##   ggplot(DS, aes(x = "", y = n, fill = FILL)) +
##     geom_col() +
##     # label each slice with its share of the total, rounded to whole percent
##     geom_text(aes(label = scales::percent(n / sum(n), 1)),
##               position = position_stack(vjust = 0.5), size = 5) +
##     # a pie chart is a stacked bar drawn in polar coordinates
##     coord_polar(theta = "y") +
##     theme_void()
## }

| geom_col

# geom_col() plots pre-computed values: here the number of rows per
# continent/country among the first 20 countries of 2007.
ds %>%
  filter(year == 2007) %>%
  slice(1:20) %>%
  count(continent, country) %>%
  ggplot(aes(x = continent, y = n, fill = country)) +
  geom_col()   # position = "stack" (default), "fill" or "dodge"

| geom_col reordered

# Same columns, ordered by the total per continent (largest first).
ds %>%
  filter(year == 2007) %>%
  slice(1:20) %>%
  count(continent, country) %>%
  # per-continent total, used only to order the x axis
  group_by(continent) %>%
  mutate(total = sum(n)) %>%
  ungroup() %>%
  ggplot() +
  geom_col(aes(x = reorder(continent, -total), y = n, fill = country))

| geom_col with fill position

# Same columns, rescaled so every column has height 1 (shares, not counts).
ds %>%
  filter(year == 2007) %>%
  slice(1:20) %>%
  count(continent, country) %>%
  ggplot(aes(x = continent, y = n, fill = country)) +
  geom_col(position = "fill")  # also try position = "stack" or "dodge"

| geom_boxplot

# Box plot of life expectancy per continent for 2007.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = continent, y = lifeExp)) +
  geom_boxplot()

| geom_boxplot and geom_jitter

# Box plot with the individual countries drawn on top as jittered points.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = continent, y = lifeExp)) +
  geom_boxplot() +
  geom_jitter(width = 0.2)

| donut plot

# Donut plot: one rectangle per continent, wrapped into a ring by coord_polar().
ds %>%
  count(continent) %>%
  mutate(total = sum(n), fraction = n / total) %>%
  # each segment spans [ymin, ymax] on the cumulative-fraction axis
  mutate(ymax = cumsum(fraction), ymin = c(0, head(ymax, n = -1)),
         labelPosition = (ymax + ymin) / 2,
         label = paste0(round(fraction * 100, 0), "%")) %>%
  ggplot(aes(ymax = ymax, ymin = ymin, xmax = 3.5, xmin = 2.5, fill = continent)) +
  geom_rect() +
  #expand_limits(x = c(10, 10), y=c(10,10)) +
  geom_text(x = 4.1, aes(y = labelPosition, label = paste0(continent, "\n", label)),
            color = "black", size = 5, fontface = "bold") +
  # namespaced call: ggsci is not attached at this point of the document
  ggsci::scale_fill_lancet() +
  coord_polar(theta = "y", direction = -1) +
  # x below xmin leaves the hole in the middle; 4.1 makes room for the labels
  xlim(c(1.5, 4.1)) +
  theme_void() +
  # the legend is hidden, so no legend guide needs to be configured
  theme(legend.position = "none", plot.margin = unit(c(0, 0, 0, 0), "lines"))

coordinates

| coord_flip flips x and y axes

# Horizontal bar chart: most frequent continent at the top, counts at bar ends.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = fct_rev(fct_infreq(continent)))) +
  geom_bar() +
  # after_stat(count) replaces the deprecated ..count.. notation; x is
  # inherited from the plot so labels and bars share the same ordering
  geom_text(aes(label = after_stat(count)), stat = "count",
            color = "black", hjust = -0.4) +
  coord_flip()

statistics

| geom boxplot with stat_summary

# Box plot annotated with summary statistics per continent.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = continent, y = lifeExp)) +
  geom_boxplot(alpha = 0.5) +
  # median as text just above the median line; `fun` and after_stat() replace
  # the deprecated `fun.y` and ..y..
  stat_summary(fun = median, geom = "text", colour = "black", vjust = -0.7,
               aes(label = round(after_stat(y), digits = 1))) +
  # mean as a red point
  stat_summary(fun = mean, geom = "point", color = "red")

| stat_smooth()

# Scatter plot with a linear regression line and its confidence band.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  stat_smooth(method = "lm", formula = y ~ x, se = TRUE)

labs

| add labels to axes and legends

# labs() sets the title, subtitle, caption, axis titles and legend titles.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent)) +
  labs(title = "Title", subtitle = "subtitle", caption = "caption",
       x = "GDP per Capita", y = "Life Expectancy in years",
       color = "CONTINENT")

themes

| select themes; most famous: theme_classic() theme_void() theme_minimal() theme_bw()

https://ggplot2.tidyverse.org/reference/ggtheme.html

# Base plot reused to compare the built-in complete themes.
p <- ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent))

p + theme_classic() + labs(title = "theme_classic")

p + theme_void() + labs(title = "theme_void")

p + theme_minimal() + labs(title = "theme_minimal")

p + theme_bw() + labs(title = "theme_bw")

| theme arguments

# theme() fine-tunes individual elements on top of a complete theme.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent)) +
  labs(x = "GDP per Capita", y = "Life Expectancy in years", color = "CONTINENT") +
  theme_classic() +
  # legend placed inside the panel (relative coordinates), x labels rotated
  theme(legend.position = c(0.7, 0.3),
        axis.text.x = element_text(angle = 45, vjust = 0.9, hjust = 0.9, size = 10))

scales

# Scales control how data values are mapped to axes and colours.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent)) +
  labs(x = "GDP per Capita", y = "Life Expectancy in years", color = "CONTINENT") +
  theme_classic() +
  # other scales worth trying:
  # scale_alpha_continuous()+
  # scale_alpha_manual()+
  # scale_y_discrete()+
  # scale_y_reverse()+
  # scale_y_log10()+
  # scale_y_sqrt()+
  # scale_y_continuous(labels = scales::percent_format(scale = 1))   # show the axis as percent
  # log x axis; rows outside `limits` are removed (ggplot2 warns about them)
  scale_x_log10(limits = c(1000, 100000), breaks = c(1000, 10000, 100000),
                labels = c("1k", "10k", "100k")) +
  scale_y_continuous(expand = expansion(mult = c(0.1, 0.2))) +  # pad the axis on both sides
  scale_color_manual(labels = c("AF", "AM", "AS", "EU", "OC"),
                     values = c("black", "grey", "pink", "red", "#38E54D"))
## Warning: Removed 20 rows containing missing values (geom_point).

| RColorBrewer

add amazing colors

# Use a ColorBrewer palette for the continent colours.
library(RColorBrewer)
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent)) +
  labs(x = "GDP per Capita", y = "Life Expectancy in years", color = "CONTINENT") +
  theme_classic() +
  scale_color_brewer(palette = "Set1")

guides

legends full control

# Base plot with three legends (size, shape, colour) to demonstrate guides().
p <- ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp, size = pop, shape = continent,
             color = continent)) +
  geom_point() +
  theme_classic()

# show plot
p

# remove one legend:
p + guides(size = "none")

# change order of legends:
p + guides(size = guide_legend(order = 1),
           color = guide_legend(order = 2),
           shape = guide_legend(order = 2))

# change legend parameters: size, keyheight, keywidth, nrow, ncol
p + guides(size = guide_legend(nrow = 2),
           color = guide_legend(nrow = 2, keyheight = 1, keywidth = 1,
                                override.aes = list(size = 7)),
           shape = "none")

facet

| split plot to multiple based on grouping

# One panel per continent, all panels sharing the same axes.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
  geom_point() +
  theme_classic() +
  facet_wrap(~continent)

# Each panel gets its own axis ranges; the strip labels are removed and the
# legend is placed inside the plot area instead.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
  geom_point() +
  theme_classic() +
  facet_wrap(~continent, scales = "free", as.table = TRUE, strip.position = "right") +
  theme(strip.background = element_blank(), strip.text = element_blank(),
        legend.position = c(0.85, 0.2)) +
  labs(title = "Free scales and remove strips")

ggarrange

| combine multiple plots

# Four separate plots (one continent each) to be combined into one figure.
A <- ds %>%
  filter(year == 2007, continent == "Asia") %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

B <- ds %>%
  filter(year == 2007, continent == "Africa") %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

C <- ds %>%
  filter(year == 2007, continent == "Europe") %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

D <- ds %>%
  filter(year == 2007, continent == "Americas") %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

# arrange the plots on a 2 x 2 grid with automatic panel labels
ggpubr::ggarrange(A, B, C, D, ncol = 2, nrow = 2, labels = "AUTO")

# labels can also be a vector, e.g. c("a1", "a2", "b", "c")
# if several plots share the same legend, show it only once with common.legend = TRUE

ggsave

| save plots to folder

# Build the plot, display it, and save exactly that plot to disk.
fig1 <- ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent))
fig1

# pass the plot explicitly rather than relying on "the last plot displayed"
ggsave("Fig1.jpeg", plot = fig1, height = 6, width = 10, dpi = 600)

Full example

# Full example: GDP per capita vs life expectancy in 2007, Jordan highlighted.
ds %>%
  filter(year == 2007) %>%
  # Colour group: the continent, with Jordan as its own sixth group. An explicit
  # factor fixes the group order; ifelse() applied directly to the `continent`
  # factor would return its integer codes, so the legend labels given below
  # would match the groups only by accident.
  mutate(colour_group = factor(
    ifelse(country == "Jordan", "Jordan", as.character(continent)),
    levels = c(levels(continent), "Jordan")
  )) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  stat_smooth(method = "loess", se = FALSE, formula = y ~ x) +
  geom_point(aes(size = pop, color = colour_group)) +
  # label Jordan only: subset the layer data instead of passing NA labels,
  # which would make ggplot2 warn about removed rows
  ggrepel::geom_text_repel(data = function(d) filter(d, country == "Jordan"),
                           aes(label = as.character(country)),
                           vjust = -3, size = 5) +
  #geom_text()+
  theme_classic() +
  labs(x = "GDP per capita", y = "Life Expectancy",
       title = "Gapminder analysis",
       caption = Sys.Date(),
       color = "Continent") +
  theme(title = element_text(size = 15, color = "red"),
        axis.title = element_text(size = 15, color = "black", hjust = 0.5, vjust = 100),
        axis.text = element_text(size = 12, color = "blue"),
        legend.title = element_text(color = "Green"),
        legend.text = element_text(size = 12),
        legend.position = c(0.92, 0.4),
        legend.background = element_blank()) +
  scale_x_log10() +
  # labels follow the level order of colour_group: five continents, then Jordan
  scale_color_brewer(palette = "Set1", labels = c("Af", "Am", "As", "Eu", "Au", "Jo")) +
  # hide the size legend and enlarge the colour legend keys
  guides(size = "none",
         col = guide_legend(keyheight = 1, keywidth = 1, override.aes = list(size = 8))) +
  coord_flip()

Additional tools

| annotate

add specific notes or lines to the plot

# annotate() adds single, hand-placed elements given in data coordinates.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent)) +
  annotate("text", x = 10000, y = 40, label = "test") +
  annotate("segment", x = 10000, y = 41, xend = 12500, yend = 50,
           col = "red", linetype = "dashed")

| geom_point with shapes and colors

# Demo data: 20 points; Y takes the values 1 to 6 once, then 3 for the rest.
df <- data.frame(
  X = 1:20,
  Y = c(1:6, rep(3, times = 14))
)

# Each X value gets its own colour and each Y value its own shape; the axis
# titles say which aesthetic an axis illustrates. The colour legend is hidden.
ggplot(df, aes(x = X, y = Y, col = as.factor(X), shape = as.factor(Y))) +
  geom_point(size = 5) +
  scale_x_continuous("Color", breaks = 1:20) +
  scale_y_continuous("Shape", breaks = 1:6) +
  guides(color = "none")

| ggrepel: Best way to add text to points

# Label only the most populous countries; ggrepel keeps labels from overlapping.
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  stat_smooth(method = "loess", se = FALSE, formula = y ~ x) +
  geom_point(aes(size = pop, color = continent)) +
  theme_classic() +
  # the label layer gets its own data: countries with more than 100M people
  ggrepel::geom_text_repel(data = ds %>% filter(year == 2007, pop > 1e8),
                           aes(label = country))

| add labels to end of lines

# GDP per capita over time: all countries in grey, two highlighted in colour
# and labelled at the end of their lines instead of through a legend.
ds %>%
  filter(country != "Kuwait") %>%
  ggplot(aes(x = year, y = gdpPercap, group = country)) +
  geom_line(color = "grey", na.rm = TRUE) +
  geom_line(data = ds %>% filter(country %in% c("Jordan", "Saudi Arabia")),
            aes(color = country), na.rm = TRUE) +
  # one label per highlighted country, just right of its most recent data point
  geom_text(data = ds %>% filter(year == max(year),
                                 country %in% c("Jordan", "Saudi Arabia")),
            aes(label = country,
                x = year + 1,
                y = gdpPercap,
                color = country), hjust = 0) +
  # extra room on the right so the labels are not cut off
  scale_x_continuous(breaks = unique(ds$year), expand = expansion(mult = c(0.1, 0.3))) +
  scale_y_continuous(breaks = 10000 * 0:5, labels = paste0(seq(0, 50, 10), "k")) +
  # a complete theme resets earlier theme() tweaks, so it goes first and the
  # legend is hidden afterwards
  theme_classic() +
  theme(legend.position = "none")

| ggsci: scientific themes

# ggsci provides colour palettes styled after scientific journals.
library(ggsci)
ds %>%
  filter(year == 2007) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent)) +
  labs(x = "GDP per Capita", y = "Life Expectancy in years", color = "CONTINENT") +
  theme_classic() +
  scale_color_nejm()

| population pyramid

# Population pyramid
# The population is split 50/50 into Male/Female purely for illustration.

ds %>%
  filter(year == 2007, continent == "Europe") %>%
  mutate(Male = pop / 2, Female = pop / 2) %>%
  # long format: one row per country and sex; columns are selected by name
  # rather than by position
  pivot_longer(c(Male, Female), names_to = "Sex", values_to = "n") %>%
  # males are drawn to the left (negative x), females to the right
  ggplot(aes(x = ifelse(Sex == "Male", -n, n), y = reorder(country, -n), fill = Sex)) +
  geom_col() +
  lemon::scale_x_symmetric(
    labels = c("40M", "30M", "20M", "10M", "0", "10M", "20M", "30M", "40M"),
    breaks = seq(-4e7, 4e7, 1e7)
  ) +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal() +
  theme(text = element_text(size = 12)) +
  labs(x = "", y = "")

| wrap long text

# Fetch the `pbc` data set via Hmisc.
Hmisc::getHdata(pbc)

# Wrap long category names so the axis labels break across several lines.
ggplot(pbc) +
  geom_bar(aes(x = str_wrap(drug, width = 10)))

| plotly package

# Convert a ggplot into an interactive plotly widget; `label` is mapped so the
# country name is part of the plot data.
plotly::ggplotly(
  ds %>%
    filter(year == 2007) %>%
    ggplot(aes(x = gdpPercap, y = lifeExp, size = pop, color = continent,
               label = country)) +
    geom_point(alpha = 0.8)
)

| gganimate

animated plots

GIF using gganimate

outside sources

| Change points to flags

enter image description here

enter image description here