(merge multiple dataframes) (ggplot, plotly, gganimate)
pacman::p_load(
rio,
here,
tidyverse,
lubridate,
plotly,
gganimate,
ggExtra, # marginal histogram
ggalt, # scatterplot encircle
ggcorrplot # corrlogam
)
# Read the World Bank workbooks stored under data/.
# Each indicator is taken from the "Data" sheet, skipping its first 3 rows.

# GDP, plus the country metadata sheet of the same workbook
wb_gdp <- here("data", "API_NY.GDP.MKTP.CD_DS2_en_excel_v2_5358382.xls") %>%
  import(which = "Data", skip = 3)
wb_metadata <- here("data", "API_NY.GDP.MKTP.CD_DS2_en_excel_v2_5358382.xls") %>%
  import(which = "Metadata - Countries")

# Population
wb_pop <- here("data", "API_SP.POP.TOTL_DS2_en_excel_v2_5358476.xls") %>%
  import(which = "Data", skip = 3)

# Life expectancy at birth
wb_life <- here("data", "API_SP.DYN.LE00.IN_DS2_en_excel_v2_5358902.xls") %>%
  import(which = "Data", skip = 3)

# GDP per capita
wb_gdpcap <- here("data", "API_NY.GDP.PCAP.CD_DS2_en_excel_v2_5358450.xls") %>%
  import(which = "Data", skip = 3)

# GDP growth (annual %)
wb_gdpgrowth <- here("data", "API_NY.GDP.MKTP.KD.ZG_DS2_en_excel_v2_5358368.xls") %>%
  import(which = "Data", skip = 3)
# Join the GDP table with the country metadata on the country code.
dt <- merge(wb_gdp, wb_metadata, by = "Country Code")

# Reshape from wide (one column per year) to long (one row per country-year).
# Year columns are selected by their four-digit names rather than by the
# fixed positions 5:66, so the reshape does not depend on column order or on
# how many years the workbook contains.
dt <- dt %>%
  pivot_longer(
    cols = matches("^[0-9]{4}$"),
    names_to = "year",
    values_to = "gdp"
  ) %>%
  # Drop the columns that are not used later; any_of() tolerates a column
  # that is already absent, as the `$<- NULL` form did.
  select(
    -any_of(c("Indicator Code", "SpecialNotes", "TableName", "Indicator Name"))
  )
Note that we need to transform variable class for ggplot:
# Coerce column types: year to integer, country identifiers to factors
dt <- dt %>%
  mutate(
    year = as.integer(year),
    `Country Code` = as.factor(`Country Code`),
    `Country Name` = as.factor(`Country Name`)
  )
# Count the missing values over the whole table
sum(is.na(dt))
## [1] 9288
Transform data from wide to long
# Reshape one World Bank indicator table from wide to long.
#
# data:       wide table with a `Country Code` column and one column per year
# value_name: name of the output column that receives the indicator values
#
# Returns a table with the columns `Country Code`, year and <value_name>.
# Year columns are matched by their four-digit names instead of the fixed
# positions 5:66, so a different number of years does not break the reshape.
wb_to_long <- function(data, value_name) {
  data %>%
    pivot_longer(
      cols = matches("^[0-9]{4}$"),
      names_to = "year",
      values_to = value_name
    ) %>%
    select(all_of(c("Country Code", "year", value_name)))
}

# Population
wb_pop <- wb_to_long(wb_pop, "pop")
# Life expectancy at birth
wb_life <- wb_to_long(wb_life, "lifeExp")
# GDP per capita
wb_gdpcap <- wb_to_long(wb_gdpcap, "gdpPercap")
# GDP growth (annual %)
wb_gdpgrowth <- wb_to_long(wb_gdpgrowth, "gdpGrowth")
Add the population, life expectancy, GDP per capita, and GDP growth columns to the GDP data frame
# Starting from the GDP table, outer-join each indicator table in turn.
# merge() is called without `by`, so it joins on the columns the two tables
# have in common.
df_list <- list(dt, wb_pop, wb_life, wb_gdpcap, wb_gdpgrowth)
full_dt <- Reduce(
  f = function(joined, indicator) merge(joined, indicator, all = TRUE),
  x = df_list
)
# Inspect the structure of the merged table
str(full_dt)
## 'data.frame': 16492 obs. of 10 variables:
## $ Country Code: Factor w/ 266 levels "ABW","AFE","AFG",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : chr "1960" "1961" "1962" "1963" ...
## $ Country Name: Factor w/ 265 levels "Afghanistan",..: 13 13 13 13 13 13 13 13 13 13 ...
## $ Region : chr "Latin America & Caribbean" "Latin America & Caribbean" "Latin America & Caribbean" "Latin America & Caribbean" ...
## $ IncomeGroup : chr "High income" "High income" "High income" "High income" ...
## $ gdp : num NA NA NA NA NA NA NA NA NA NA ...
## $ pop : num 54608 55811 56682 57475 58178 ...
## $ lifeExp : num 64.2 64.5 64.8 65.1 65.3 ...
## $ gdpPercap : num NA NA NA NA NA NA NA NA NA NA ...
## $ gdpGrowth : num NA NA NA NA NA NA NA NA NA NA ...
# Standardise the column names (e.g. gdpPercap -> gdp_percap)
full_dt <- full_dt %>%
  janitor::clean_names()

library(ggplot2)
theme_set(theme_bw()) # pre-set the bw theme

# Keep observations from 2020 onwards that have a region.
# The str() output shows year as character, so it is converted before the
# comparison instead of relying on string ordering against a number.
full_dt <- full_dt %>%
  filter(as.integer(year) >= 2020L, !is.na(region))
# Scatterplot of life expectancy against log GDP per capita, with points
# coloured by region and sized by population, plus a loess smoother.
g1 <- ggplot(full_dt, aes(x = log(gdp_percap), y = life_exp)) +
  geom_point(aes(col = region, size = pop)) +
  geom_smooth(method = "loess", se = FALSE, formula = y ~ x) +
  ylim(c(49, 89)) + # fixed y range around the life_exp values, easier to read
  labs(
    subtitle = "gdp_percap ~ life_exp",
    y = "lifeExp",
    x = "log(gdpPercap)", # the x aesthetic is log-transformed
    title = "Scatterplot",
    caption = "Source: WB 2023"
  )
plot(g1)
# Observations with a life expectancy of 55 or below, to be encircled
full_dt_select <- full_dt %>% filter(life_exp <= 55)

# Same scatterplot as before, with the selected observations encircled.
g2 <- ggplot(full_dt, aes(x = log(gdp_percap), y = life_exp)) +
  geom_point(aes(col = region, size = pop)) +
  geom_smooth(method = "loess", se = FALSE, formula = y ~ x) +
  xlim(c(5, 13)) +
  ylim(c(49, 89)) +
  geom_encircle(
    aes(x = log(gdp_percap), y = life_exp),
    data = full_dt_select,
    color = "red", # colour of the enclosing curve
    size = 2, # thickness of the enclosing curve
    expand = 0.08
  ) +
  labs(
    subtitle = "gdp_percap ~ life_exp",
    y = "lifeExp",
    x = "log(gdpPercap)", # the x aesthetic is log-transformed
    title = "Scatterplot",
    caption = "Source: WB 2023"
  )
plot(g2)
theme_set(theme_bw()) # pre-set the bw theme

# Plain scatterplot with a linear fit, used to illustrate overplotting.
# labs() has to be chained with `+`: as a separate statement it only prints
# the label list to the console and the plot keeps its default labels.
# Axis labels follow the aesthetics: x is log(gdp_percap), y is life_exp.
g3 <- ggplot(full_dt, aes(log(gdp_percap), life_exp)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE, formula = y ~ x) +
  labs(
    subtitle = "gdp per cap ~ life exp",
    y = "life_exp",
    x = "log(gdp_percap)",
    title = "Scatterplot with overlapping points",
    caption = "Source: WB 2023"
  )
g3
The original data has more than 200 data points, but the chart seems to
display fewer. This is because many points overlap and appear as a
single dot. Overlap like this is especially easy to miss when both
variables are integers (or heavily rounded values), because many
observations then share exactly the same coordinates. So be extra
careful the next time you make a scatterplot with such data.
So how do we handle this? There are a few options. We can make a jitter plot with geom_jitter()
# Jittered scatterplot: geom_jitter() draws each point randomly displaced
# from its original position; `width` sets the amount of horizontal jitter.
# geom_point() is not added as well, otherwise every observation would be
# drawn twice (once in place and once jittered).
g3 <- ggplot(full_dt, aes(log(gdp_percap), life_exp)) +
  geom_smooth(method = lm, se = FALSE, formula = y ~ x) +
  geom_jitter(width = .5, size = 1) +
  labs(
    subtitle = "gdp per cap ~ life exp",
    y = "life_exp", # labels follow the aesthetics: y is life_exp
    x = "log(gdp_percap)", # x is log(gdp_percap)
    title = "Scatterplot with overlapping points",
    caption = "Source: WB 2023"
  )
g3
The second option to overcome the problem of overlapping data points is to use what is called a counts chart. Wherever more points overlap, the circle gets bigger.
# Counts chart: geom_count() sizes each circle by the number of observations
# at that location. A single labs() call sets all labels; the axis labels
# follow the aesthetics (x is log(gdp_percap), y is life_exp).
g4 <- ggplot(full_dt, aes(log(gdp_percap), life_exp)) +
  geom_smooth(method = lm, se = FALSE, formula = y ~ x) +
  geom_count(col = "tomato3", show.legend = FALSE) +
  labs(
    subtitle = "gdp per cap ~ life exp",
    y = "life_exp",
    x = "log(gdp_percap)",
    title = "Scatterplot with overlapping points - count plot",
    caption = "Source: WB 2023"
  )
g4
library(ggplot2)
library(ggExtra) # marginal histogram (ggMarginal)

# Base plot: counts chart with a linear fit
g4 <- ggplot(full_dt, aes(log(gdp_percap), life_exp)) +
  geom_smooth(method = lm, se = FALSE, formula = y ~ x) +
  geom_count()

# Add marginal histograms; the alternatives below switch the marginal type
ggMarginal(g4, type = "histogram", fill = "transparent")
# ggMarginal(g4, type = "boxplot", fill = "transparent")
# ggMarginal(g4, type = "density", fill = "transparent")
## Prepare data
# replace na value with mean value
# option 1
# Keep the four indicators used for the correlogram. They are selected by
# name (the same columns as dropping positions 1:5 and 10 of the cleaned
# table), then every NA is replaced by the mean of its column.
correlation <- full_dt %>%
  select(gdp, pop, life_exp, gdp_percap) %>%
  mutate(
    across(
      everything(),
      ~ replace(.x, is.na(.x), mean(.x, na.rm = TRUE))
    )
  )
# option 2
# NOTE(review): `t` shares its name with base::t(); the name is kept because
# later code may refer to it. Its result is not used below.
t <- full_dt %>% select(gdp, pop, life_exp, gdp_percap)
library(imputeTS)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
t <- na_mean(t)
# Standardise the columns, then compute the correlation matrix rounded to
# one decimal place
correlation <- scale(correlation)
corr <- round(cor(correlation), 1)
# Correlogram of the World Bank indicators: lower triangle only, variables
# ordered by hierarchical clustering, correlation values printed on top.
# NOTE: this reassigns g1, which held the first scatterplot.
g1 <- ggcorrplot(
  corr,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE,
  lab_size = 3,
  method = "circle",
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlogram of World Bank indicators",
  ggtheme = theme_bw
)
g1