1. Các biểu đồ cơ bản.

1.1 Biểu đồ scatter

# install.packages("ggplot2")
# load package and data
options(scipen=999)  # turn-off scientific notation like 1e+48
library(ggplot2)
theme_set(theme_bw())  # pre-set the bw theme.
data("midwest", package = "ggplot2")
# midwest <- read.csv("http://goo.gl/G1K41K")  # bkup data source

# Scatterplot
gg <- ggplot(midwest, aes(x=area, y=poptotal)) + 
  geom_point(aes(col=state, size=popdensity)) + 
  geom_smooth(method="loess", se=F) + 
  xlim(c(0, 0.1)) + 
  ylim(c(0, 500000)) + 
  labs(subtitle="Area Vs Population", 
       y="Population", 
       x="Area", 
       title="Scatterplot", 
       caption = "Source: midwest")

plot(gg)
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).

1.2 Biểu đồ scatter với hình tròn bao.

# install 'ggalt' pkg
# devtools::install_github("hrbrmstr/ggalt")
options(scipen = 999)
library(ggplot2)
library(ggalt)
## Warning: package 'ggalt' was built under R version 3.4.2
midwest_select <- midwest[midwest$poptotal > 350000 & 
                            midwest$poptotal <= 500000 & 
                            midwest$area > 0.01 & 
                            midwest$area < 0.1, ]

# Plot
ggplot(midwest, aes(x=area, y=poptotal)) + 
  geom_point(aes(col=state, size=popdensity)) +   # draw points
  geom_smooth(method="loess", se=F) + 
  xlim(c(0, 0.1)) + 
  ylim(c(0, 500000)) +   # draw smoothing line
  geom_encircle(aes(x=area, y=poptotal), 
                data=midwest_select, 
                color="red", 
                size=2, 
                expand=0.08) +   # encircle
  labs(subtitle="Area Vs Population", 
       y="Population", 
       x="Area", 
       title="Scatterplot + Encircle", 
       caption="Source: midwest")
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).

1.3 Biểu đồ count.

# load package and data
library(ggplot2)
data(mpg, package="ggplot2")
# mpg <- read.csv("http://goo.gl/uEeRGu")

# Scatterplot
theme_set(theme_bw())  # pre-set the bw theme.
g <- ggplot(mpg, aes(cty, hwy))
g + geom_count(col="tomato3", show.legend=F) +
  labs(subtitle="mpg: city vs highway mileage", 
       y="hwy", 
       x="cty", 
       title="Counts Plot")

1.4 Biểu đồ Marginal Histogram / Boxplot

# load package and data
library(ggplot2)
library(ggExtra)
## Warning: package 'ggExtra' was built under R version 3.4.2
data(mpg, package="ggplot2")
# mpg <- read.csv("http://goo.gl/uEeRGu")

# Scatterplot
theme_set(theme_bw())  # pre-set the bw theme.
mpg_select <- mpg[mpg$hwy >= 35 & mpg$cty > 27, ]
g <- ggplot(mpg, aes(cty, hwy)) + 
  geom_count() + 
  geom_smooth(method="lm", se=F)

ggMarginal(g, type = "histogram", fill="transparent")
ggMarginal(g, type = "boxplot", fill="transparent")

# ggMarginal(g, type = "density", fill="transparent")

2. Các loại biểu đồ nâng cao.

2.1 Biểu đồ dân số.

# Read data
library(readxl)
library(ggfortify)
Pop <- read_excel('D:/3.Project/6.Paper/Dang Tuan Vu/chart/Popular.xlsx')
Pop$GroupAge <- factor(Pop$GroupAge, levels = c('0 - 4','5-9','10-14','15-19','20-24','25-29','30-34','35-39','40-44','45-49','50-54','55 - 59','60-64','65-69','70-74'
,'75-79','80-84','85-89','90-94','95-99','100+'))
# X Axis Breaks and Labels 
brks <- seq(-15000000, 15000000, 5000000)
lbls = paste0(as.character(c(seq(15, 0, -5), seq(5, 15, 5))), "m")

# Plot
theme_set(theme_bw(base_family = 'serif'))
ggplot(Pop, aes(x = GroupAge, y = Users, fill = Gender)) +   # Fill column
                              geom_bar(stat = "identity", width = .6) +   # draw the bars
                              scale_y_continuous(breaks = brks,   # Breaks
                                                 labels = lbls) + # Labels
                              coord_flip() +  # Flip axes
                              labs(title="Populator Chart") +
                              # theme_tufte() +  # Tufte theme from ggfortify
                              theme(plot.title = element_text(hjust = .5), 
                                    axis.ticks = element_blank(),
                                    legend.position = 'bottom') +   # Centre plot title
                              scale_fill_brewer(palette = "Dark2")  # Color palette

2.2 Biểu đồ tương quan biến.

# devtools::install_github("kassambara/ggcorrplot")
library(ggplot2)
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 3.4.2
# Correlation matrix
data(mtcars)
corr <- round(cor(mtcars), 1)

# Plot
ggcorrplot(corr, hc.order = TRUE, 
           type = "lower", 
           lab = TRUE, 
           lab_size = 3, 
           method="square", 
           colors = c("tomato2", "white", "springgreen3"), 
           title="Correlation matrix in mtcars",
           ggtheme = ggplot2::theme_bw) +
  theme_bw(base_family = 'serif') + 
  theme(text = element_text(family = "serif"),
        plot.title = element_text(hjust = .5)) +
  labs(x = '',
       y = '')

2.3 Biểu đồ dumbell

# Read data
library(readxl)
# library(ggfortify)
library(ggalt)
# theme_set(theme_bw(base_family = 'serif'))
pct <- read_excel('D:/3.Project/6.Paper/Dang Tuan Vu/chart/GDP.xlsx')
pct$Area <- factor(pct$Area,levels = as.character(pct$Area))

breaks <- seq(0.05,0.25,by = 0.05)
percent <- paste0(as.character(c(seq(5,25,5))), "%")

gg <- ggplot(pct, aes(x=pct_2013, xend=pct_2014, y=Area, group=Area)) + 
        geom_dumbbell(color="#a3c4dc", 
                      size=0.75, 
                      point.colour.l="#0e668b") + 
        scale_x_continuous(breaks = breaks,
                           labels = percent) +
        labs(x=NULL, 
             y=NULL, 
             title="Dumbbell Chart", 
             subtitle="Pct Change: 2013 vs 2014", 
             caption="Source: https://github.com/hrbrmstr/ggalt") +
        theme(plot.title = element_text(hjust=0.5, face="bold"),
              plot.background=element_rect(fill="#f7f7f7"),
              panel.background=element_rect(fill="#f7f7f7"),
              panel.grid.minor=element_blank(),
              panel.grid.major.y=element_blank(),
              panel.grid.major.x=element_line(),
              axis.ticks=element_blank(),
              legend.position="top",
              panel.border=element_blank())
## Warning: Ignoring unknown parameters: point.colour.l
plot(gg)

2.4 Biểu đồ Lollipop.

Không legend

library(ggplot2)
theme_set(theme_bw(base_family = 'serif'))

# Plot
Data <- mtcars
Data <- Data[order(Data$mpg), ]
Data$name <- rownames(Data)
Data$name <- factor(Data$name, levels = Data$name)

Data %>% 
  slice(10:20) %>% 
  ggplot(aes(x=name, y=mpg, label = mpg)) +
  geom_point(size=6) + 
  geom_segment(aes(x=name, 
                   xend=name, 
                   y=0, 
                   yend=mpg)) + 
  geom_text(color = 'white',family = 'serif',size = 2) +
  labs(title="Lollipop Chart", 
       subtitle="Cars Vs MPG", 
       caption="source: mtcars") + 
  theme(plot.title = element_text(hjust=0.5, face="bold")) +
  coord_flip() +
  labs(x = '',
       y = '') 

Có legend

theme_set(theme_bw(base_family = 'serif'))
Data$mpg_z <- round((Data$mpg - mean(Data$mpg))/sd(Data$mpg), 2)  # compute normalized mpg
Data$mpg_type <- ifelse(Data$mpg_z < 0, "below", "above")  # above / below avg flag
Data <- Data[order(Data$mpg_z), ]

 # Diverging Barcharts
Data %>% 
  ggplot(aes(x=name, y=mpg_z, label=mpg_z)) + 
  geom_point(stat='identity', aes(col=mpg_type), size=6)  +
  scale_color_manual(name="Mileage", 
                     labels = c("Above Average", "Below Average"), 
                     values = c("above"="#00ba38", "below"="#f8766d")) + 
  geom_text(color="white", size=2) +
  labs(title="Diverging Dot Plot", 
       subtitle="Normalized mileage from 'mtcars': Dotplot") + 
  ylim(-2.5, 2.5) +
  coord_flip()

2.5 Headmap

## Example data
x <- seq(1,10, length.out=20)
y <- seq(1,10, length.out=20)
data <- data.frame(x, y)
data$Z <- runif(20, 0, 20)

# Levelplot with ggplot2
library(ggplot2)
theme_set(theme_bw(base_family = 'serif'))
# ggplot(data, aes(x, y, Z)) + geom_tile(aes(fill = Z)) + theme_bw()
 
# To change the color of the gradation :
ggplot(data, aes(x, y, Z)) + geom_tile(aes(fill = Z)) + 
    theme_bw() + 
    scale_fill_gradient(low="white", high="blue") 

3. Các dạng biểu đồ cho các phân tích.

3.1 Tỷ lệ giữ chân khách hàng (Retension rate).

# loading libraries
# Source: http://analyzecore.com/2015/12/10/cohort-analysis-retention-rate-visualization-r/
library(dplyr)
library(reshape2)
library(ggplot2)
library(scales)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# creating data sample
set.seed(10)
cohorts <- data.frame(cohort = paste('cohort', formatC(c(1:36), width=2, format='d', flag='0'), sep = '_'),
 Y_00 = sample(c(1300:1500), 36, replace = TRUE),
 Y_01 = c(sample(c(800:1000), 36, replace = TRUE)),
 Y_02 = c(sample(c(600:800), 24, replace = TRUE), rep(NA, 12)),
 Y_03 = c(sample(c(400:500), 12, replace = TRUE), rep(NA, 24)))
# simulating seasonality (Black Friday)
cohorts[c(11, 23, 35), 2] <- as.integer(cohorts[c(11, 23, 35), 2] * 1.25)
cohorts[c(11, 23, 35), 3] <- as.integer(cohorts[c(11, 23, 35), 3] * 1.10)
cohorts[c(11, 23, 35), 4] <- as.integer(cohorts[c(11, 23, 35), 4] * 1.07)
 
# calculating retention rate and preparing data for plotting
df_plot <- melt(cohorts, id.vars = 'cohort', value.name = 'number', variable.name = 'year_of_LT')
 
df_plot <- df_plot %>%
 group_by(cohort) %>%
 arrange(year_of_LT) %>%
 mutate(number_prev_year = lag(number),
 number_Y_00 = number[which(year_of_LT == 'Y_00')]) %>%
 ungroup() %>%
 mutate(ret_rate_prev_year = number / number_prev_year,
 ret_rate = number / number_Y_00,
 year_cohort = paste(year_of_LT, cohort, sep = '-'))
 
##### The first way for plotting cycle plot via scaling
# calculating the coefficient for scaling 2nd axis
k <- max(df_plot$number_prev_year[df_plot$year_of_LT == 'Y_01'] * 1.15) / min(df_plot$ret_rate[df_plot$year_of_LT == 'Y_01'])

a. Retention rate cycle plot

# retention rate cycle plot
ggplot(na.omit(df_plot), aes(x = year_cohort, y = ret_rate, group = year_of_LT, color = year_of_LT)) +
 theme_bw() +
 geom_point(size = 4) +
 geom_text(aes(label = percent(round(ret_rate, 2))),
 size = 4, hjust = 0.4, vjust = -0.6, fontface = "plain") +
 # smooth method can be changed (e.g. for "lm")
 geom_smooth(size = 2.5, method = 'loess', color = 'darkred', aes(fill = year_of_LT)) +
 geom_bar(aes(y = number_prev_year / k, fill = year_of_LT), alpha = 0.2, stat = 'identity') +
 geom_bar(aes(y = number / k, fill = year_of_LT), alpha = 0.6, stat = 'identity') +
 geom_text(aes(y = 0, label = cohort), color = 'white', angle = 90, size = 4, hjust = -0.05, vjust = 0.4) +
 geom_text(aes(y = number_prev_year / k, label = number_prev_year),
 angle = 90, size = 4, hjust = -0.1, vjust = 0.4) +
 geom_text(aes(y = number / k, label = number),
 angle = 90, size = 4, hjust = -0.1, vjust = 0.4) +
 theme(legend.position='none',
 plot.title = element_text(size=20, face="bold", vjust=2),
 axis.title.x = element_text(size=18, face="bold"),
 axis.title.y = element_text(size=18, face="bold"),
 axis.text = element_text(size=16),
 axis.text.x = element_blank(),
 axis.ticks.x = element_blank(),
 axis.ticks.y = element_blank(),
 panel.border = element_blank(),
 panel.grid.major = element_blank(),
 panel.grid.minor = element_blank()) +
 labs(x = 'Year of Lifetime by Cohorts', y = 'Number of Customers / Retention Rate') +
 ggtitle("Customer Retention Rate - Cycle plot")

b. Retention rate and number customer (Comboo chart)

##### The second way for plotting cycle plot via multi-plotting
# plot #1 - Retention rate
p1 <- ggplot(na.omit(df_plot), aes(x = year_cohort, y = ret_rate, group = year_of_LT, color = year_of_LT)) +
 theme_bw() +
 geom_point(size = 4) +
 geom_text(aes(label = percent(round(ret_rate, 2))),
 size = 4, hjust = 0.4, vjust = -0.6, fontface = "plain") +
 geom_smooth(size = 2.5, method = 'loess', color = 'darkred', aes(fill = year_of_LT)) +
 theme(legend.position='none',
 plot.title = element_text(size=20, face="bold", vjust=2),
 axis.title.x = element_blank(),
 axis.title.y = element_text(size=18, face="bold"),
 axis.text = element_blank(),
 axis.ticks.x = element_blank(),
 axis.ticks.y = element_blank(),
 panel.border = element_blank(),
 panel.grid.major = element_blank(),
 panel.grid.minor = element_blank()) +
 labs(y = 'Retention Rate') +
 ggtitle("Customer Retention Rate - Cycle plot")
 
# plot #2 - number of customers
p2 <- ggplot(na.omit(df_plot), aes(x = year_cohort, group = year_of_LT, color = year_of_LT)) +
 theme_bw() +
 geom_bar(aes(y = number_prev_year, fill = year_of_LT), alpha = 0.2, stat = 'identity') +
 geom_bar(aes(y = number, fill = year_of_LT), alpha = 0.6, stat = 'identity') +
 geom_text(aes(y = number_prev_year, label = number_prev_year),
 angle = 90, size = 4, hjust = -0.1, vjust = 0.4) +
 geom_text(aes(y = number, label = number),
 angle = 90, size = 4, hjust = -0.1, vjust = 0.4) +
 geom_text(aes(y = 0, label = cohort), color = 'white', angle = 90, size = 4, hjust = -0.05, vjust = 0.4) +
 theme(legend.position='none',
 plot.title = element_text(size=20, face="bold", vjust=2),
 axis.title.x = element_text(size=18, face="bold"),
 axis.title.y = element_text(size=18, face="bold"),
 axis.text = element_blank(),
 axis.ticks.x = element_blank(),
 axis.ticks.y = element_blank(),
 panel.border = element_blank(),
 panel.grid.major = element_blank(),
 panel.grid.minor = element_blank()) +
 scale_y_continuous(limits = c(0, max(df_plot$number_Y_00 * 1.1))) +
 labs(x = 'Year of Lifetime by Cohorts', y = 'Number of Customers')
 
# multiplot
grid.arrange(p1, p2, ncol = 1)

c. Retention rate bubble chart

# retention rate bubble chart
ggplot(na.omit(df_plot), aes(x = cohort, y = ret_rate, group = cohort, color = year_of_LT)) +
 theme_bw() +
 scale_size(range = c(15, 40)) +
 geom_line(size = 2, alpha = 0.3) +
 geom_point(aes(size = number_prev_year), alpha = 0.3) +
 geom_point(aes(size = number), alpha = 0.8) +
 geom_smooth(linetype = 2, size = 2, method = 'loess', aes(group = year_of_LT, fill = year_of_LT), alpha = 0.2) +
 geom_text(aes(label = paste0(number, '/', number_prev_year, '\n', percent(round(ret_rate, 2)))),
 color = 'white', size = 3, hjust = 0.5, vjust = 0.5, fontface = "plain") +
 theme(legend.position='none',
 plot.title = element_text(size=20, face="bold", vjust=2),
 axis.title.x = element_text(size=18, face="bold"),
 axis.title.y = element_text(size=18, face="bold"),
 axis.text = element_text(size=16),
 axis.text.x = element_text(size=10, angle=90, hjust=.5, vjust=.5, face="plain"),
 axis.ticks.x = element_blank(),
 axis.ticks.y = element_blank(),
 panel.border = element_blank(),
 panel.grid.major = element_blank(),
 panel.grid.minor = element_blank()) +
 labs(x = 'Cohorts', y = 'Retention Rate by Year of Lifetime') +
 ggtitle("Customer Retention Rate - Bubble chart")

# retention rate falling drops chart
ggplot(df_plot, aes(x = cohort, y = ret_rate, group = cohort, color = year_of_LT)) +
 theme_bw() +
 scale_size(range = c(15, 40)) +
 scale_y_continuous(limits = c(0, 1)) +
 geom_line(size = 2, alpha = 0.3) +
 geom_point(aes(size = number), alpha = 0.8) +
 geom_text(aes(label = paste0(number, '\n', percent(round(ret_rate, 2)))),
 color = 'white', size = 3, hjust = 0.5, vjust = 0.5, fontface = "plain") +
 theme(legend.position='none',
 plot.title = element_text(size=20, face="bold", vjust=2),
 axis.title.x = element_text(size=18, face="bold"),
 axis.title.y = element_text(size=18, face="bold"),
 axis.text = element_text(size=16),
 axis.text.x = element_text(size=10, angle=90, hjust=.5, vjust=.5, face="plain"),
 axis.ticks.x = element_blank(),
 axis.ticks.y = element_blank(),
 panel.border = element_blank(),
 panel.grid.major = element_blank(),
 panel.grid.minor = element_blank()) +
 labs(x = 'Cohorts', y = 'Retention Rate by Year of Lifetime') +
 ggtitle("Customer Retention Rate - Falling Drops chart")
## Warning: Removed 36 rows containing missing values (geom_path).
## Warning: Removed 36 rows containing missing values (geom_point).
## Warning: Removed 36 rows containing missing values (geom_text).

3.2 Phân loại khách hàng (Clusters customer).

4. Supper chart.

# https://rlbarter.github.io/superheat-examples/Organ/