# Data Visualization (GSO, Part 1)
#
# Nguyen Chi Dung

#-------------------------------------
#   Scatter Plot + Regression Line
#-------------------------------------
library(tidyverse)

# Load the gapminder data set: 
library(gapminder)
data("gapminder")

# Open the help page to learn what each variable in the data set means: 
?gapminder

# Basic scatter plot of life expectancy against GDP per capita:

gapminder %>% 
  ggplot(aes(gdpPercap, lifeExp)) + 
  geom_point()

# Make the points translucent and give them a custom colour:
library(tidyverse)

gapminder %>% 
  ggplot(aes(gdpPercap, lifeExp)) + 
  geom_point(alpha = 0.2, color = "purple")

# Emphasise a few outliers (GDP per capita above 60000) by drawing them again
# as larger red points on top of the translucent cloud.
# Plain `<-` replaces the global right-assignment `->>`: at top level it binds
# the same name without searching enclosing environments.

p1 <- gapminder %>% 
  ggplot(aes(gdpPercap, lifeExp)) + 
  geom_point(alpha = 0.2, color = "purple") + 
  geom_point(data = gapminder %>% filter(gdpPercap > 60000), 
             aes(gdpPercap, lifeExp), color = "red", size = 2)

p1

# The theme can be changed for a single plot:
p1 + theme_linedraw()

p1 + theme_light()

# Or fixed for every plot drawn from here on:
theme_set(theme_minimal())
p1

# Adjust the x axis (log10 scale):
p2 <- p1 + 
  scale_x_log10()

# Display the result; previously p2 was built but never shown.
p2

# Set the axis titles, plot title, subtitle and caption:

gapminder %>% 
  ggplot(aes(gdpPercap, lifeExp)) + 
  geom_point(alpha = 0.2, color = "purple") + 
  scale_x_log10() + 
  scale_y_continuous(breaks = seq(20, 90, 10)) + 
  labs(x = "GDP per Capita", 
       y = "Life Expectancy", 
       title = "The Relationship Between GDP and Life Expectancy", 
       subtitle = "Note: Life Expectancy at birth, in years", 
       caption = "Data Source: http://www.gapminder.org/data/")

# Refine the x axis further: format its tick labels as dollar amounts.
library(scales)

# Plain `<-` replaces the global right-assignment `->>`; at top level both
# bind `g3` in the same place.
g3 <- gapminder %>% 
  ggplot(aes(gdpPercap, lifeExp)) + 
  geom_point(alpha = 0.2, color = "purple") + 
  scale_x_log10(labels = scales::dollar) + 
  scale_y_continuous(breaks = seq(20, 90, 10)) + 
  labs(x = "GDP per Capita", 
       y = "Life Expectancy", 
       title = "The Relationship Between GDP and Life Expectancy", 
       subtitle = "Note: Life Expectancy at birth, in years", 
       caption = "Data Source: http://www.gapminder.org/data/")

g3

# Add a regression line (linear model), in three stylings:
g3 + geom_smooth(method = "lm")

g3 + geom_smooth(method = "lm", color = "blue", fill = "blue", alpha = 0.1)

g3 + geom_smooth(method = "lm", color = "orange", se = FALSE)

# Highlight, for example, Vietnam in 2007.
# The subset is computed once and shared by the point and label layers of both
# plots below instead of being re-filtered for every layer.
vietnam_2007 <- gapminder %>% 
  filter(year == 2007 & country == "Vietnam")

g3 + 
  geom_smooth(method = "lm", color = "orange", se = FALSE) + 
  geom_point(data = vietnam_2007, 
             aes(gdpPercap, lifeExp), color = "red", size = 3) + 
  geom_text(data = vietnam_2007, 
            aes(label = country))

# Further refinement: use ggrepel's geom_text_repel() for the label.
library(ggrepel)

g3 + 
  geom_smooth(method = "lm", color = "orange", se = FALSE) + 
  geom_point(data = vietnam_2007, 
             aes(gdpPercap, lifeExp), color = "red", size = 3) + 
  geom_text_repel(data = vietnam_2007, 
                  aes(label = country), force = 19)

# Câu hỏi mở rộng: Hiển thị thêm Ấn Độ, Trung Quốc, Thái Lan và Malaysia trong
# năm 2007 nhằm so sánh các quốc gia này với Việt Nam về tuổi thọ bình quân bằng
# cách nào? 


# Scatter plot with a separate colour for each continent:

ggplot(data = gapminder, 
       mapping = aes(x = gdpPercap, y = lifeExp, colour = continent)) + 
  geom_point(alpha = 0.4)

# Or hide the legend:
ggplot(data = gapminder, 
       mapping = aes(x = gdpPercap, y = lifeExp, colour = continent)) + 
  geom_point(show.legend = FALSE, alpha = 0.4)

# Or move the legend (style 1: numeric coordinates):
ggplot(data = gapminder, 
       mapping = aes(x = gdpPercap, y = lifeExp, colour = continent)) + 
  theme(legend.position = c(0.9, 0.3)) + 
  geom_point(alpha = 0.4)

# Move the legend (style 2: a named side of the plot):
ggplot(data = gapminder, 
       mapping = aes(x = gdpPercap, y = lifeExp, colour = continent)) + 
  theme(legend.position = "top") + 
  geom_point(alpha = 0.4)

# A different presentation: dollar-formatted log10 x axis plus full labelling.
# Every plot in this section uses the same x scale and the same titles, so
# they are defined once here and added to each plot instead of being repeated.

gdp_life_layers <- list(
  scale_x_log10(labels = scales::dollar), 
  labs(x = "GDP per Capita", 
       y = "Life Expectancy", 
       title = "The Relationship Between GDP and Life Expectancy", 
       subtitle = "Note: Life Expectancy at birth, in years", 
       caption = "Data Source: http://www.gapminder.org/data/")
)

gapminder %>% 
  ggplot(aes(gdpPercap, lifeExp, color = continent)) + 
  geom_point(alpha = 0.4) + 
  gdp_life_layers

# From here on the grouping column is renamed to `Continent`; the colour
# aesthetic and the facets below refer to that name.
continent_base <- gapminder %>% 
  rename(Continent = continent) %>% 
  ggplot(aes(gdpPercap, lifeExp, color = Continent)) + 
  gdp_life_layers

continent_scatter <- continent_base + 
  geom_point(alpha = 0.4)

continent_scatter

# Change the legend position (style 1):

continent_scatter + 
  theme(legend.position = "top")

# Change the legend position (style 2):

continent_scatter + 
  theme(legend.position = c(0.85, 0.25))

# Scatter plot on five separate panels (one facet per Continent value),
# with the legend suppressed:
continent_panels <- continent_base + 
  geom_point(alpha = 0.4, show.legend = FALSE)

continent_panels + 
  facet_wrap(~ Continent)

continent_panels + 
  facet_wrap(~ Continent, scales = "free")

continent_panels + 
  facet_wrap(~ Continent, nrow = 1, ncol = 5)

# The same panels with fainter points and an orange linear fit:
continent_faint <- continent_base + 
  geom_point(alpha = 0.3, show.legend = FALSE)

continent_trend <- continent_faint + 
  geom_smooth(method = "lm", color = "orange", fill = "orange", alpha = 0.2)

continent_trend + 
  facet_wrap(~ Continent, scales = "free")

continent_trend + 
  facet_wrap(~ Continent)

# Pick the point colours manually:

continent_faint + 
  geom_smooth(method = "lm", color = "green", fill = "green", alpha = 0.2) + 
  facet_wrap(~ Continent) + 
  scale_color_manual(values = c("red", "blue", "purple", "#E69F00", "black"))

# Single panel; the linear fit inherits the Continent colour mapping:
continent_faint + 
  geom_smooth(method = "lm") + 
  theme(legend.position = c(0.85, 0.25))

# Regression line for the Americas only, keeping every other continent as a
# faint background. Each subset is filtered once and reused by its layers.
americas <- gapminder %>% 
  filter(continent == "Americas")
other_continents <- gapminder %>% 
  filter(continent != "Americas")

gapminder %>% 
  ggplot(aes(gdpPercap, lifeExp)) + 
  geom_point(data = other_continents, alpha = 0.1) + 
  geom_point(data = americas, color = "purple") + 
  geom_smooth(data = americas, 
              method = "lm", color = "orange", fill = "orange", alpha = 0.1) + 
  gdp_life_layers

#-------------------------------------------------------------------------------
#  Histogram / Density Plot
#  Tham khảo: http://www.pewsocialtrends.org/2016/05/11/americas-shrinking-middle-class-a-close-look-at-changes-within-metropolitan-areas/
#-------------------------------------------------------------------------------

# Read the data (third sheet of the workbook, skipping the first 7 rows):
library(readxl)
income <- read_excel(
  path = "D:/GSO_R_Course/data_for_visualization/america_metro.xlsx", 
  sheet = 3, 
  skip = 7
)

# Drop the column and the row that are not needed.
# NOTE(review): `X__2` is the auto-generated name of an unnamed column; the
# exact name presumably depends on the installed readxl version - confirm.
income <- slice(select(income, -X__2), -1)

# Give the columns meaningful names:

colnames(income) <- c(
  "Metro", 
  "All_99", "Lower_99", "Middle_99", "Upper_99",
  "All_14", "Lower_14", "Middle_14", "Upper_14"
)

# Quick look at the column classes (recorded output follows):
sapply(X = income, FUN = class)
##       Metro      All_99    Lower_99   Middle_99    Upper_99      All_14 
## "character"   "numeric"   "numeric"   "numeric"   "numeric"   "numeric" 
##    Lower_14   Middle_14    Upper_14 
##   "numeric"   "numeric"   "numeric"
# Three histograms of All_14, differing in bin width and styling:
p1 <- ggplot(income, aes(All_14)) +
  labs(title = "Bin Width = 1000") +
  geom_histogram(binwidth = 1000)

p2 <- ggplot(income, aes(All_14)) +
  labs(title = "Bin Width = 5000") +
  geom_histogram(binwidth = 5000, alpha = 0.3, fill = "blue", color = "red")

p3 <- ggplot(income, aes(All_14)) +
  labs(title = "Bin Width = 10000") +
  geom_histogram(binwidth = 10000, fill = "white", color = "grey30")

# Show the three plots side by side in a single row:
library(gridExtra)
grid.arrange(p1, p2, p3, ncol = 3, nrow = 1)

# Density plot of All_14, first with defaults:

ggplot(income, aes(All_14)) +
  geom_density()

# ... then with a grey outline and fill:
ggplot(income, aes(All_14)) +
  geom_density(size = 1.2, fill = "grey80", color = "grey40")

# Histogram and density drawn together; the histogram's y is mapped to
# density so both layers share the same vertical scale:

ggplot(income, aes(All_14)) + 
  geom_density(alpha = 0.2, fill = "red", color = "red") + 
  geom_histogram(aes(y = ..density..), 
                 alpha = 0.2, color = "blue", fill = "blue", 
                 binwidth = 2000)

# Compare groups. First reshape the two "All" columns into long form
# (one row per Metro and Year column):
compare <- gather(
  select(income, Metro, All_99, All_14), 
  key = Year, value = Income, -Metro
)

# Overlaid densities per Year; see http://colorbrewer2.org/ for colour choices:
ggplot(compare, aes(Income, fill = Year)) +
  scale_fill_manual(values = c("#1b9e77", "#7570b3")) +
  geom_density(alpha = 0.4)

# One histogram per Year, stacked in rows:
ggplot(compare, aes(Income)) +
  facet_grid(Year ~ .) +
  geom_histogram(binwidth = 2000, fill = "white", color = "grey30")

# Long form of the per-class columns: split names such as "Lower_99" into a
# Class part and a Year part, then spell the year out in full.
class_comparison <- income %>%
  select(Metro, Lower_99:Upper_99, Lower_14:Upper_14) %>%
  gather(key = Class, value = Income, -Metro) %>%
  separate(col = Class, into = c("Class", "Year")) %>%
  mutate(Year = if_else(Year == "99", "1999", "2014"))

# Histograms per class, each panel with its own x range:
ggplot(class_comparison, aes(Income, colour = Class, fill = Class)) +
  facet_wrap(~ Class, scales = "free_x") +
  geom_histogram(alpha = 0.2)

# One panel per Year/Class combination, free x axis only:
ggplot(class_comparison, aes(Income, colour = Class, fill = Class)) +
  facet_wrap(Year ~ Class, scales = "free_x") +
  geom_histogram(alpha = 0.2)

# Same panels with both axes free:
ggplot(class_comparison, aes(Income, colour = Class, fill = Class)) +
  facet_wrap(Year ~ Class, scales = "free") +
  geom_histogram(alpha = 0.2)

# Same panels, coloured by Year instead of Class:
ggplot(class_comparison, aes(Income, colour = Year, fill = Year)) +
  facet_wrap(Year ~ Class, scales = "free") +
  geom_histogram(alpha = 0.2)

# Mean income of each class, per year:
class_mean <- class_comparison %>%
  group_by(Class, Year) %>%
  summarise(Mean = mean(Income)) %>% 
  ungroup()

# Title, subtitle and caption shared by the two annotated plots below; kept in
# one place so the two versions cannot drift apart.
income_title <- "Median Household Income by Income Tier Across U.S. Metropolitan Areas"
income_subtitle <- "Average median income across 229 metros decreased from $67,863 in 1999 to $62,662 in 2014, representing an 8% loss in \nincome. The lower income class experienced the largest impact with a 11% decrease while the middle and upper class median \nhousehold income decreased by 6% and 8% respectively."
income_caption <- "Source: Pew Research Center analysis of the \n2000 decennial census and 2014 American \nCommunity Survey (IPUMS)"

# First version: histograms per Year/Class with a dashed line at each mean.
# NOTE: the x axis still shows raw Income here although the axis title says
# "(thousands)"; the refined version below rescales the data to match.
class_comparison %>% 
  ggplot(aes(x = Income, fill = Class, color = Class)) +
  geom_histogram(alpha = 0.2, show.legend = FALSE) +
  facet_wrap(Year ~ Class, scales = "free") + 
  geom_vline(data = class_mean, aes(xintercept = Mean), linetype = "dashed") + 
  labs(x = "Median Household Income (thousands)", y = "Frequency", 
       title = income_title, 
       subtitle = income_subtitle, 
       caption = income_caption)

# Refinement: plot Income and the mean lines in thousands, format the x ticks
# as dollars, and fix the y range to 0-58 with no padding.

p <- class_comparison %>% 
  ggplot(aes(x = Income / 1000, fill = Class, color = Class)) +
  geom_histogram(alpha = 0.2, show.legend = FALSE) +
  facet_wrap(Year ~ Class, scales = "free") + 
  geom_vline(data = class_mean, aes(xintercept = Mean / 1000), linetype = "dashed") + 
  scale_x_continuous(labels = scales::dollar) +
  scale_y_continuous(limits = c(0, 58), expand = c(0, 0)) + 
  labs(x = "Median Household Income (thousands)", y = "Frequency", 
       title = income_title, 
       subtitle = income_subtitle, 
       caption = income_caption)


p

# Further refinement: print each group's mean next to its dashed line.

# Label text: the mean rounded to whole dollars, with thousands separators
# and a leading "$".
class_mean <- class_mean %>%
  mutate(Label = paste0("$", prettyNum(round(Mean, 0), big.mark = ",")))

# The Class and Year columns of `class_mean` already place each label in its
# facet, so no extra `id` aesthetic is needed (geom_text does not understand
# `id` and only warned about it).
p + 
  geom_text(data = class_mean, 
            aes(x = Mean / 1000, y = 52, label = Label),
            size = 3, hjust = -.1, show.legend = FALSE, color = "black")

#------------------------
#      Box Plot
#------------------------

# Plain box plot of Sepal.Length per Species:
ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) + 
  geom_boxplot()

# Colour the outliers red, for example, to draw attention to them:
ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) + 
  geom_boxplot(outlier.color = "red")

# Optionally mark each group's mean with a blue point:
ggplot(data = iris, mapping = aes(x = Species, y = Sepal.Length)) + 
  geom_boxplot(outlier.color = "red") + 
  stat_summary(geom = "point", fun.y = mean, colour = "blue")

# Box plots of all four measurements for the three iris species, with title
# and caption. The data is reshaped to long form (one row per Species,
# Variable and Value) so that each Variable gets its own panel:
ggplot(gather(iris, Variable, Value, -Species), 
       aes(x = Species, y = Value, colour = Species, fill = Species)) + 
  labs(title = "An Example of Boxplot", 
       caption = "Data Source: Iris data set by R. Fisher (1936)", 
       x = NULL, 
       y = NULL) + 
  facet_wrap(~ Variable, scales = "free") + 
  geom_boxplot(alpha = 0.4, show.legend = FALSE)

# Or another look, using the black-and-white theme:
ggplot(gather(iris, Variable, Value, -Species), 
       aes(x = Species, y = Value, colour = Species, fill = Species)) + 
  labs(title = "An Example of Boxplot", 
       caption = "Data Source: Iris data set by R. Fisher (1936)", 
       x = NULL, 
       y = NULL) + 
  facet_wrap(~ Variable, scales = "free") + 
  geom_boxplot(alpha = 0.4, show.legend = FALSE) + 
  theme_bw()

# For comparison: the same long-form data as histograms and density plots.

# Histograms per Variable, legend placed on top:
ggplot(gather(iris, Variable, Value, -Species), 
       aes(x = Value, colour = Species, fill = Species)) + 
  labs(title = "An Example of Histogram", 
       caption = "Data Source: Iris data set by R. Fisher (1936)", 
       x = NULL, 
       y = NULL) + 
  facet_wrap(~ Variable) + 
  geom_histogram(alpha = 0.3) + 
  theme_bw() + 
  theme(legend.position = "top")

# Densities per Variable, free axes in every panel, no legend:
ggplot(gather(iris, Variable, Value, -Species), 
       aes(x = Value, colour = Species, fill = Species)) + 
  labs(title = "An Example of Density Plot", 
       caption = "Data Source: Iris data set by R. Fisher (1936)", 
       x = NULL, 
       y = NULL) + 
  facet_wrap(~ Variable, scales = "free") + 
  geom_density(show.legend = FALSE, alpha = 0.3) + 
  theme_bw()

# Densities per Variable with the default facet scales:
ggplot(gather(iris, Variable, Value, -Species), 
       aes(x = Value, colour = Species, fill = Species)) + 
  labs(title = "An Example of Density Plot", 
       caption = "Data Source: Iris data set by R. Fisher (1936)", 
       x = NULL, 
       y = NULL) + 
  facet_wrap(~ Variable) + 
  geom_density(show.legend = FALSE, alpha = 0.3) + 
  theme_bw()