Dataset Overview

This dataset was downloaded from the Facebook website. After taking a look, I thought it would be fun to refresh my ggplot2 visualization skills and walk myself through the entire EDA procedure.

# Attach the wrangling/plotting packages with their startup messages hidden.
suppressMessages(
  library(tidyverse)
)
## Warning: package 'tidyverse' was built under R version 3.5.2
## Warning: package 'stringr' was built under R version 3.5.2
## Warning: package 'forcats' was built under R version 3.5.2
suppressMessages(
  library(cowplot)
)
## Warning: package 'cowplot' was built under R version 3.5.2
# Read the raw CSV and preview its columns and types.
df <- read_csv(file = "~/Downloads/UnitedNationsPopulationdataset.csv")
glimpse(df)
## Rows: 374
## Columns: 5
## $ `Country Name`   <chr> "Aruba", "Afghanistan", "Angola", "Albania", "United…
## $ `Country Code`   <chr> "ABW", "AFG", "AGO", "ALB", "ARE", "ARG", "ARM", "AT…
## $ Region           <chr> "The Americas", "Asia", "Africa", "Europe", "Middle …
## $ Year             <dbl> 1980, 1980, 1980, 1980, 1980, 1980, 1980, 1980, 1980…
## $ `Fertility Rate` <dbl> 4.820, 7.450, 7.379, 6.186, 6.928, 3.109, 4.550, 4.4…

The dataset has five features: three are strings and two are numeric, so the following analysis steps are simple.

Data pre-processing

# Give the space-containing columns syntactic names so later code can refer
# to them without backticks, and treat Year as a discrete variable (it is
# used as a fill / facet variable in the plots).
df <- df %>%
  rename(
    Country.Name = `Country Name`,
    Country.Code = `Country Code`,
    Fertility.Rate = `Fertility Rate`
  ) %>%
  mutate(Year = as.factor(Year))

# One row per country with the two years side by side (Y1980, Y2018).
# NOTE: despite its name, df.long holds the *wide* layout; the name is kept
# because later chunks refer to it.
# pivot_wider() replaces the superseded spread(); names_prefix builds the
# Y<year> column names directly, so no follow-up rename() is needed.
# arrange() keeps the rows in alphabetical country order.
df.long <- df %>%
  pivot_wider(
    names_from = Year,
    values_from = Fertility.Rate,
    names_prefix = "Y"
  ) %>%
  arrange(Country.Name)
glimpse(df)
## Rows: 374
## Columns: 5
## $ Country.Name   <chr> "Aruba", "Afghanistan", "Angola", "Albania", "United A…
## $ Country.Code   <chr> "ABW", "AFG", "AGO", "ALB", "ARE", "ARG", "ARM", "ATG"…
## $ Region         <chr> "The Americas", "Asia", "Africa", "Europe", "Middle Ea…
## $ Year           <fct> 1980, 1980, 1980, 1980, 1980, 1980, 1980, 1980, 1980, …
## $ Fertility.Rate <dbl> 4.820, 7.450, 7.379, 6.186, 6.928, 3.109, 4.550, 4.425…
glimpse(df.long)
## Rows: 187
## Columns: 5
## $ Country.Name <chr> "Afghanistan", "Albania", "Algeria", "Angola", "Antigua …
## $ Country.Code <chr> "AFG", "ALB", "DZA", "AGO", "ATG", "ARG", "ARM", "ABW", …
## $ Region       <chr> "Asia", "Europe", "Africa", "Africa", "The Americas", "T…
## $ Y1980        <dbl> 7.450, 6.186, 7.524, 7.379, 4.425, 3.109, 4.550, 4.820, …
## $ Y2018        <dbl> 5.050, 1.771, 2.893, 6.165, 2.088, 2.335, 1.553, 1.669, …

EDA

Top 10 countries ranked by Fertility Rate

  # Ten highest fertility rates within each year, one facet per year.
  # slice_max() names the ranking column explicitly; the previous top_n(10)
  # silently ranked by whichever column happened to be last in the table.
  df %>%
    group_by(Year) %>%
    slice_max(Fertility.Rate, n = 10) %>%
    ungroup() %>%
    ggplot(aes(
      x = reorder(Country.Name, Fertility.Rate),
      y = Fertility.Rate,
      fill = Country.Name
    )) +
    geom_col() +
    facet_grid(~Year) +
    labs(x = "") +
    theme_classic() +
    # The fill legend would just repeat the x-axis labels, so it is hidden.
    theme(
      legend.position = "none",
      axis.text.x = element_text(angle = 90, hjust = 1)
    )

Average Fertility Rate in Regions

 # Box plot of country-level fertility rates per region, split by year.
 p1 <- df %>%
   ggplot(aes(x = Region, y = Fertility.Rate, fill = Year)) +
   geom_boxplot() +
   labs(x = "") +
   theme_classic() +
   theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Heat map of the mean fertility rate for every Region x Year combination.
# .groups = "drop" returns an ungrouped summary explicitly instead of relying
# on summarise()'s implicit regrouping (and its console message).
p2 <- df %>%
  group_by(Region, Year) %>%
  summarise(Fertility.Rate = mean(Fertility.Rate), .groups = "drop") %>%
  ggplot(aes(x = Region, y = Year, fill = Fertility.Rate)) +
  geom_tile() +
  scale_fill_continuous(low = "blue", high = "red") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Show both panels side by side.
plot_grid(p1, p2)

Fertility Rate Change in Countries

# Bar chart of the 1980 -> 2018 change in fertility rate, one bar per country,
# ordered from the largest increase to the largest decrease.
# The previous `group_by(change) %>% top_n(10)` never selected a top 10: every
# row in a group shares the same `change`, so all rows were kept and only
# countries with a missing change were dropped. That effect is kept here with
# an explicit filter().
p1 <- df.long %>%
  mutate(
    # A rate that is unchanged (Y2018 == Y1980) is labelled "Increase".
    status = case_when(
      Y2018 >= Y1980 ~ "Increase",
      Y2018 < Y1980 ~ "Decrease"
    ),
    change = Y2018 - Y1980
  ) %>%
  filter(!is.na(change)) %>%
  ggplot(aes(x = reorder(Country.Name, -change), y = change, fill = status)) +
  geom_col() +
  labs(x = "") +
  theme_classic() +
  # Country labels are hidden on the x axis.
  theme(axis.text.x = element_blank())

# Scatter plot of the 1980 rate against the 2018 rate, coloured by region.
p2 <- df.long %>%
  ggplot(aes(x = Y2018, y = Y1980, color = Region)) +
  geom_point() +
  theme_classic()

# Stack the two panels vertically.
plot_grid(p1, p2, nrow = 2)

Countries with Increased & Decreased Rate

library(ggalluvial)
# Count countries per (Region, status) pair and draw the flows from each
# region to the direction of its 1980 -> 2018 change.
df.long %>%
  mutate(status = if_else(Y2018 >= Y1980, "Increase", "Decrease")) %>%
  count(Region, status) %>%
  ggplot(aes(y = n, axis1 = Region, axis2 = status)) +
  geom_flow(aes(fill = status), width = 0.4) +
  geom_stratum(width = 0.4) +
  geom_text(stat = "stratum", infer.label = TRUE) +
  theme_classic()

Session Info

# Print the R session settings and loaded package versions for reproducibility.
devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 3.5.0 (2018-04-23)
##  os       macOS High Sierra 10.13.6   
##  system   x86_64, darwin15.6.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       Asia/Shanghai               
##  date     2020-07-06                  
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package     * version date       lib source        
##  assertthat    0.2.1   2019-03-21 [1] CRAN (R 3.5.2)
##  backports     1.1.8   2020-06-17 [1] CRAN (R 3.5.0)
##  blob          1.2.1   2020-01-20 [1] CRAN (R 3.5.2)
##  broom         0.5.6   2020-04-20 [1] CRAN (R 3.5.0)
##  callr         3.4.3   2020-03-28 [1] CRAN (R 3.5.0)
##  cellranger    1.1.0   2016-07-27 [1] CRAN (R 3.5.0)
##  cli           2.0.2   2020-02-28 [1] CRAN (R 3.5.2)
##  colorspace    1.4-1   2019-03-18 [1] CRAN (R 3.5.2)
##  cowplot     * 1.0.0   2019-07-11 [1] CRAN (R 3.5.2)
##  crayon        1.3.4   2017-09-16 [1] CRAN (R 3.5.0)
##  DBI           1.1.0   2019-12-15 [1] CRAN (R 3.5.2)
##  dbplyr        1.4.4   2020-05-27 [1] CRAN (R 3.5.0)
##  desc          1.2.0   2018-05-01 [1] CRAN (R 3.5.0)
##  devtools      2.3.0   2020-04-10 [1] CRAN (R 3.5.0)
##  digest        0.6.25  2020-02-23 [1] CRAN (R 3.5.2)
##  dplyr       * 1.0.0   2020-05-29 [1] CRAN (R 3.5.0)
##  ellipsis      0.3.1   2020-05-15 [1] CRAN (R 3.5.0)
##  evaluate      0.14    2019-05-28 [1] CRAN (R 3.5.2)
##  fansi         0.4.1   2020-01-08 [1] CRAN (R 3.5.2)
##  farver        2.0.3   2020-01-16 [1] CRAN (R 3.5.2)
##  forcats     * 0.5.0   2020-03-01 [1] CRAN (R 3.5.2)
##  fs            1.4.1   2020-04-04 [1] CRAN (R 3.5.0)
##  generics      0.0.2   2018-11-29 [1] CRAN (R 3.5.0)
##  ggalluvial  * 0.11.3  2020-04-16 [1] CRAN (R 3.5.0)
##  ggplot2     * 3.3.2   2020-06-19 [1] CRAN (R 3.5.0)
##  glue          1.4.1   2020-05-13 [1] CRAN (R 3.5.0)
##  gtable        0.3.0   2019-03-25 [1] CRAN (R 3.5.2)
##  haven         2.3.1   2020-06-01 [1] CRAN (R 3.5.0)
##  hms           0.5.3   2020-01-08 [1] CRAN (R 3.5.2)
##  htmltools     0.5.0   2020-06-16 [1] CRAN (R 3.5.0)
##  httr          1.4.1   2019-08-05 [1] CRAN (R 3.5.2)
##  jsonlite      1.7.0   2020-06-25 [1] CRAN (R 3.5.0)
##  knitr         1.29    2020-06-23 [1] CRAN (R 3.5.0)
##  labeling      0.3     2014-08-23 [1] CRAN (R 3.5.0)
##  lattice       0.20-41 2020-04-02 [1] CRAN (R 3.5.0)
##  lifecycle     0.2.0   2020-03-06 [1] CRAN (R 3.5.2)
##  lubridate     1.7.9   2020-06-08 [1] CRAN (R 3.5.0)
##  magrittr      1.5     2014-11-22 [1] CRAN (R 3.5.0)
##  memoise       1.1.0   2017-04-21 [1] CRAN (R 3.5.0)
##  modelr        0.1.8   2020-05-19 [1] CRAN (R 3.5.0)
##  munsell       0.5.0   2018-06-12 [1] CRAN (R 3.5.0)
##  nlme          3.1-145 2020-03-04 [1] CRAN (R 3.5.2)
##  pillar        1.4.4   2020-05-05 [1] CRAN (R 3.5.0)
##  pkgbuild      1.0.8   2020-05-07 [1] CRAN (R 3.5.0)
##  pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 3.5.2)
##  pkgload       1.1.0   2020-05-29 [1] CRAN (R 3.5.0)
##  prettyunits   1.1.1   2020-01-24 [1] CRAN (R 3.5.2)
##  processx      3.4.2   2020-02-09 [1] CRAN (R 3.5.2)
##  ps            1.3.3   2020-05-08 [1] CRAN (R 3.5.0)
##  purrr       * 0.3.4   2020-04-17 [1] CRAN (R 3.5.0)
##  R6            2.4.1   2019-11-12 [1] CRAN (R 3.5.2)
##  Rcpp          1.0.4.6 2020-04-09 [1] CRAN (R 3.5.0)
##  readr       * 1.3.1   2018-12-21 [1] CRAN (R 3.5.0)
##  readxl        1.3.1   2019-03-13 [1] CRAN (R 3.5.2)
##  remotes       2.1.1   2020-02-15 [1] CRAN (R 3.5.2)
##  reprex        0.3.0   2019-05-16 [1] CRAN (R 3.5.2)
##  rlang         0.4.6   2020-05-02 [1] CRAN (R 3.5.0)
##  rmarkdown     2.3     2020-06-18 [1] CRAN (R 3.5.0)
##  rprojroot     1.3-2   2018-01-03 [1] CRAN (R 3.5.0)
##  rstudioapi    0.11    2020-02-07 [1] CRAN (R 3.5.2)
##  rvest         0.3.5   2019-11-08 [1] CRAN (R 3.5.2)
##  scales        1.1.1   2020-05-11 [1] CRAN (R 3.5.0)
##  sessioninfo   1.1.1   2018-11-05 [1] CRAN (R 3.5.0)
##  stringi       1.4.6   2020-02-17 [1] CRAN (R 3.5.2)
##  stringr     * 1.4.0   2019-02-10 [1] CRAN (R 3.5.2)
##  testthat      2.3.2   2020-03-02 [1] CRAN (R 3.5.2)
##  tibble      * 3.0.1   2020-04-20 [1] CRAN (R 3.5.0)
##  tidyr       * 1.1.0   2020-05-20 [1] CRAN (R 3.5.0)
##  tidyselect    1.1.0   2020-05-11 [1] CRAN (R 3.5.0)
##  tidyverse   * 1.3.0   2019-11-21 [1] CRAN (R 3.5.2)
##  usethis       1.6.1   2020-04-29 [1] CRAN (R 3.5.0)
##  utf8          1.1.4   2018-05-24 [1] CRAN (R 3.5.0)
##  vctrs         0.3.1   2020-06-05 [1] CRAN (R 3.5.0)
##  withr         2.2.0   2020-04-20 [1] CRAN (R 3.5.0)
##  xfun          0.15    2020-06-21 [1] CRAN (R 3.5.0)
##  xml2          1.3.2   2020-04-23 [1] CRAN (R 3.5.0)
##  yaml          2.2.1   2020-02-01 [1] CRAN (R 3.5.2)
## 
## [1] /Library/Frameworks/R.framework/Versions/3.5/Resources/library