This dataset was downloaded from the Facebook website. After taking a look at it, I thought it would be fun to refresh my ggplot2 visualization skills and walk myself through the entire EDA procedure.
suppressMessages(library(tidyverse))
## Warning: package 'tidyverse' was built under R version 3.5.2
## Warning: package 'stringr' was built under R version 3.5.2
## Warning: package 'forcats' was built under R version 3.5.2
suppressMessages(library(cowplot))
## Warning: package 'cowplot' was built under R version 3.5.2
# Read the raw CSV into a tibble, then print a compact column-by-column
# overview (names, types, first values).
df <- read_csv(
  file = '~/Downloads/UnitedNationsPopulationdataset.csv'
)
df %>% glimpse()
## Rows: 374
## Columns: 5
## $ `Country Name` <chr> "Aruba", "Afghanistan", "Angola", "Albania", "United…
## $ `Country Code` <chr> "ABW", "AFG", "AGO", "ALB", "ARE", "ARG", "ARM", "AT…
## $ Region <chr> "The Americas", "Asia", "Africa", "Europe", "Middle …
## $ Year <dbl> 1980, 1980, 1980, 1980, 1980, 1980, 1980, 1980, 1980…
## $ `Fertility Rate` <dbl> 4.820, 7.450, 7.379, 6.186, 6.928, 3.109, 4.550, 4.4…
The dataset has five features: three are strings and two are numeric, so the following analysis steps are simple.
# Give the columns syntactic names and treat Year as a categorical variable so
# it can be used directly as a fill / facet variable in the plots below.
df <- df %>%
  rename(
    Country.Name = `Country Name`,
    Country.Code = `Country Code`,
    Fertility.Rate = `Fertility Rate`
  ) %>%
  mutate(Year = as.factor(Year))

# Reshape to one row per country with one fertility column per year.
# NOTE: despite its name, df.long is the *wide* layout of df.
# pivot_wider() replaces the superseded spread(); names_prefix = "Y" yields
# Y1980 / Y2018 directly instead of renaming hard-coded year columns afterwards.
# The explicit arrange() keeps the rows in alphabetical country order.
df.long <- df %>%
  pivot_wider(
    names_from = Year,
    values_from = Fertility.Rate,
    names_prefix = "Y"
  ) %>%
  arrange(Country.Name)

glimpse(df)
## Rows: 374
## Columns: 5
## $ Country.Name <chr> "Aruba", "Afghanistan", "Angola", "Albania", "United A…
## $ Country.Code <chr> "ABW", "AFG", "AGO", "ALB", "ARE", "ARG", "ARM", "ATG"…
## $ Region <chr> "The Americas", "Asia", "Africa", "Europe", "Middle Ea…
## $ Year <fct> 1980, 1980, 1980, 1980, 1980, 1980, 1980, 1980, 1980, …
## $ Fertility.Rate <dbl> 4.820, 7.450, 7.379, 6.186, 6.928, 3.109, 4.550, 4.425…
# Inspect the per-country table (one fertility column per year).
glimpse(df.long)
## Rows: 187
## Columns: 5
## $ Country.Name <chr> "Afghanistan", "Albania", "Algeria", "Angola", "Antigua …
## $ Country.Code <chr> "AFG", "ALB", "DZA", "AGO", "ATG", "ARG", "ARM", "ABW", …
## $ Region <chr> "Asia", "Europe", "Africa", "Africa", "The Americas", "T…
## $ Y1980 <dbl> 7.450, 6.186, 7.524, 7.379, 4.425, 3.109, 4.550, 4.820, …
## $ Y2018 <dbl> 5.050, 1.771, 2.893, 6.165, 2.088, 2.335, 1.553, 1.669, …
# Bar chart of the ten highest fertility rates within each year, one panel per
# year, with countries ordered along the x axis by fertility rate.
df %>%
  group_by(Year) %>%
  # Rank explicitly by Fertility.Rate rather than relying on top_n()'s implicit
  # "last column" default; ties at the cut-off are kept, as top_n() did.
  slice_max(Fertility.Rate, n = 10) %>%
  ungroup() %>%
  ggplot(aes(
    x = reorder(Country.Name, Fertility.Rate),
    y = Fertility.Rate,
    fill = Country.Name
  )) +
  geom_col() +
  facet_grid(~Year) +
  # The legend is hidden below, so only the x-axis title needs clearing.
  labs(x = "") +
  theme_classic() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, hjust = 1)
  )
# Box plots of fertility rate per region, filled by year; region labels are
# rotated so they do not overlap.
p1 <- ggplot(
  data = df,
  mapping = aes(x = Region, y = Fertility.Rate, fill = Year)
) +
  geom_boxplot() +
  labs(x = '') +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Heat map of the mean fertility rate for every region/year combination
# (blue = low, red = high).
p2 <- df %>%
  group_by(Region, Year) %>%
  summarise(
    Fertility.Rate = mean(Fertility.Rate),
    # Return an ungrouped tibble; this also silences the
    # "`summarise()` regrouping output" message.
    .groups = "drop"
  ) %>%
  ggplot(aes(x = Region, y = Year, fill = Fertility.Rate)) +
  geom_tile() +
  scale_fill_continuous(low = "blue", high = "red") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Show the box plot and the heat map next to each other.
plot_grid(p1, p2)
# Bar chart of the change in fertility rate between 1980 and 2018 for every
# country, ordered from the largest to the smallest change and colored by
# whether the rate went up or down.
p1 <- df.long %>%
  mutate(
    status = case_when(
      Y2018 >= Y1980 ~ "Increase",
      Y2018 < Y1980 ~ "Decrease"
    ),
    change = Y2018 - Y1980
  ) %>%
  # The previous `group_by(change) %>% top_n(10)` grouped on the ranking
  # variable itself, so it never removed anything except rows with a missing
  # change. That effect is kept, but stated explicitly. To plot only the N
  # largest changes instead, use `slice_max(change, n = N)` here.
  filter(!is.na(change)) %>%
  ggplot(aes(x = reorder(Country.Name, -change), y = change, fill = status)) +
  geom_col() +
  labs(x = "") +
  theme_classic() +
  # All countries are drawn, so individual country labels would be unreadable.
  theme(axis.text.x = element_blank())
# Scatter plot of each country's 2018 fertility rate (x) against its 1980
# rate (y), colored by region.
p2 <- ggplot(
  data = df.long,
  mapping = aes(x = Y2018, y = Y1980, color = Region)
) +
  geom_point() +
  theme_classic()

# Stack the change bar chart above the scatter plot.
plot_grid(p1, p2, nrow = 2)
library(ggalluvial)
# Alluvial diagram linking each region (first axis) to the direction of its
# countries' fertility change (second axis); flow width is the country count.
df.long %>%
  mutate(
    status = case_when(
      Y2018 >= Y1980 ~ 'Increase',
      Y2018 < Y1980 ~ 'Decrease'
    )
  ) %>%
  group_by(Region, status) %>%
  count() %>%
  ggplot(mapping = aes(y = n, axis1 = Region, axis2 = status)) +
  geom_flow(mapping = aes(fill = status), width = 0.4) +
  geom_stratum(width = 0.4) +
  # Label every stratum with its own category name.
  geom_text(stat = "stratum", infer.label = TRUE) +
  theme_classic()
# Print the R version, platform details and package versions for reproducibility.
devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 3.5.0 (2018-04-23)
## os macOS High Sierra 10.13.6
## system x86_64, darwin15.6.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz Asia/Shanghai
## date 2020-07-06
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date lib source
## assertthat 0.2.1 2019-03-21 [1] CRAN (R 3.5.2)
## backports 1.1.8 2020-06-17 [1] CRAN (R 3.5.0)
## blob 1.2.1 2020-01-20 [1] CRAN (R 3.5.2)
## broom 0.5.6 2020-04-20 [1] CRAN (R 3.5.0)
## callr 3.4.3 2020-03-28 [1] CRAN (R 3.5.0)
## cellranger 1.1.0 2016-07-27 [1] CRAN (R 3.5.0)
## cli 2.0.2 2020-02-28 [1] CRAN (R 3.5.2)
## colorspace 1.4-1 2019-03-18 [1] CRAN (R 3.5.2)
## cowplot * 1.0.0 2019-07-11 [1] CRAN (R 3.5.2)
## crayon 1.3.4 2017-09-16 [1] CRAN (R 3.5.0)
## DBI 1.1.0 2019-12-15 [1] CRAN (R 3.5.2)
## dbplyr 1.4.4 2020-05-27 [1] CRAN (R 3.5.0)
## desc 1.2.0 2018-05-01 [1] CRAN (R 3.5.0)
## devtools 2.3.0 2020-04-10 [1] CRAN (R 3.5.0)
## digest 0.6.25 2020-02-23 [1] CRAN (R 3.5.2)
## dplyr * 1.0.0 2020-05-29 [1] CRAN (R 3.5.0)
## ellipsis 0.3.1 2020-05-15 [1] CRAN (R 3.5.0)
## evaluate 0.14 2019-05-28 [1] CRAN (R 3.5.2)
## fansi 0.4.1 2020-01-08 [1] CRAN (R 3.5.2)
## farver 2.0.3 2020-01-16 [1] CRAN (R 3.5.2)
## forcats * 0.5.0 2020-03-01 [1] CRAN (R 3.5.2)
## fs 1.4.1 2020-04-04 [1] CRAN (R 3.5.0)
## generics 0.0.2 2018-11-29 [1] CRAN (R 3.5.0)
## ggalluvial * 0.11.3 2020-04-16 [1] CRAN (R 3.5.0)
## ggplot2 * 3.3.2 2020-06-19 [1] CRAN (R 3.5.0)
## glue 1.4.1 2020-05-13 [1] CRAN (R 3.5.0)
## gtable 0.3.0 2019-03-25 [1] CRAN (R 3.5.2)
## haven 2.3.1 2020-06-01 [1] CRAN (R 3.5.0)
## hms 0.5.3 2020-01-08 [1] CRAN (R 3.5.2)
## htmltools 0.5.0 2020-06-16 [1] CRAN (R 3.5.0)
## httr 1.4.1 2019-08-05 [1] CRAN (R 3.5.2)
## jsonlite 1.7.0 2020-06-25 [1] CRAN (R 3.5.0)
## knitr 1.29 2020-06-23 [1] CRAN (R 3.5.0)
## labeling 0.3 2014-08-23 [1] CRAN (R 3.5.0)
## lattice 0.20-41 2020-04-02 [1] CRAN (R 3.5.0)
## lifecycle 0.2.0 2020-03-06 [1] CRAN (R 3.5.2)
## lubridate 1.7.9 2020-06-08 [1] CRAN (R 3.5.0)
## magrittr 1.5 2014-11-22 [1] CRAN (R 3.5.0)
## memoise 1.1.0 2017-04-21 [1] CRAN (R 3.5.0)
## modelr 0.1.8 2020-05-19 [1] CRAN (R 3.5.0)
## munsell 0.5.0 2018-06-12 [1] CRAN (R 3.5.0)
## nlme 3.1-145 2020-03-04 [1] CRAN (R 3.5.2)
## pillar 1.4.4 2020-05-05 [1] CRAN (R 3.5.0)
## pkgbuild 1.0.8 2020-05-07 [1] CRAN (R 3.5.0)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 3.5.2)
## pkgload 1.1.0 2020-05-29 [1] CRAN (R 3.5.0)
## prettyunits 1.1.1 2020-01-24 [1] CRAN (R 3.5.2)
## processx 3.4.2 2020-02-09 [1] CRAN (R 3.5.2)
## ps 1.3.3 2020-05-08 [1] CRAN (R 3.5.0)
## purrr * 0.3.4 2020-04-17 [1] CRAN (R 3.5.0)
## R6 2.4.1 2019-11-12 [1] CRAN (R 3.5.2)
## Rcpp 1.0.4.6 2020-04-09 [1] CRAN (R 3.5.0)
## readr * 1.3.1 2018-12-21 [1] CRAN (R 3.5.0)
## readxl 1.3.1 2019-03-13 [1] CRAN (R 3.5.2)
## remotes 2.1.1 2020-02-15 [1] CRAN (R 3.5.2)
## reprex 0.3.0 2019-05-16 [1] CRAN (R 3.5.2)
## rlang 0.4.6 2020-05-02 [1] CRAN (R 3.5.0)
## rmarkdown 2.3 2020-06-18 [1] CRAN (R 3.5.0)
## rprojroot 1.3-2 2018-01-03 [1] CRAN (R 3.5.0)
## rstudioapi 0.11 2020-02-07 [1] CRAN (R 3.5.2)
## rvest 0.3.5 2019-11-08 [1] CRAN (R 3.5.2)
## scales 1.1.1 2020-05-11 [1] CRAN (R 3.5.0)
## sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.5.0)
## stringi 1.4.6 2020-02-17 [1] CRAN (R 3.5.2)
## stringr * 1.4.0 2019-02-10 [1] CRAN (R 3.5.2)
## testthat 2.3.2 2020-03-02 [1] CRAN (R 3.5.2)
## tibble * 3.0.1 2020-04-20 [1] CRAN (R 3.5.0)
## tidyr * 1.1.0 2020-05-20 [1] CRAN (R 3.5.0)
## tidyselect 1.1.0 2020-05-11 [1] CRAN (R 3.5.0)
## tidyverse * 1.3.0 2019-11-21 [1] CRAN (R 3.5.2)
## usethis 1.6.1 2020-04-29 [1] CRAN (R 3.5.0)
## utf8 1.1.4 2018-05-24 [1] CRAN (R 3.5.0)
## vctrs 0.3.1 2020-06-05 [1] CRAN (R 3.5.0)
## withr 2.2.0 2020-04-20 [1] CRAN (R 3.5.0)
## xfun 0.15 2020-06-21 [1] CRAN (R 3.5.0)
## xml2 1.3.2 2020-04-23 [1] CRAN (R 3.5.0)
## yaml 2.2.1 2020-02-01 [1] CRAN (R 3.5.2)
##
## [1] /Library/Frameworks/R.framework/Versions/3.5/Resources/library