This post demonstrates how to use the tidyverse packages to explore the gapminder data set, with ggplot2 used to visualise the results.
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.4
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## -- Conflicts --------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(gapminder) # this package contains the data
## Warning: package 'gapminder' was built under R version 3.6.3
# Bind the packaged gapminder data to a short working name.
df <- gapminder::gapminder
# Show each column's type together with its first few values.
str(object = df)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of 6 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ pop : int 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num 779 821 853 836 740 ...
There are six variables:
* country
* continent
* year
* lifeExp: life expectancy at birth
* pop: total population
* gdpPercap: per-capita GDP
# Preview the first six rows (the default number for head()).
head(x = df, n = 6L)
## # A tibble: 6 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
# Per-column summary statistics. Uses the working name `df`, like the
# surrounding steps; at this point it holds the same data as `gapminder`.
summary(df)
## country continent year lifeExp
## Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60
## Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20
## Algeria : 12 Asia :396 Median :1980 Median :60.71
## Angola : 12 Europe :360 Mean :1980 Mean :59.47
## Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85
## Australia : 12 Max. :2007 Max. :82.60
## (Other) :1632
## pop gdpPercap
## Min. :6.001e+04 Min. : 241.2
## 1st Qu.:2.794e+06 1st Qu.: 1202.1
## Median :7.024e+06 Median : 3531.8
## Mean :2.960e+07 Mean : 7215.3
## 3rd Qu.:1.959e+07 3rd Qu.: 9325.5
## Max. :1.319e+09 Max. :113523.1
##
# Count the distinct countries, continents and years present in the data.
summarise(
  df,
  nb_country = n_distinct(country),
  nb_continent = n_distinct(continent),
  nb_year = n_distinct(year)
)
## # A tibble: 1 x 3
## nb_country nb_continent nb_year
## <int> <int> <int>
## 1 142 5 12
Although the variable year is recorded as an integer in the raw data, we will treat it as categorical because there are only 12 unique years. The earliest year is 1952, and the data appear to be recorded every five years.
3A. By variable
Finding the highest/lowest values. Example: the top 10 countries with the highest life expectancy (for a specific year).
# Ten countries with the highest life expectancy in 2007, highest first.
df %>%
  filter(year == 2007) %>%
  arrange(desc(lifeExp)) %>%
  select(continent, country, lifeExp) %>%
  head(n = 10)
## # A tibble: 10 x 3
## continent country lifeExp
## <fct> <fct> <dbl>
## 1 Asia Japan 82.6
## 2 Asia Hong Kong, China 82.2
## 3 Europe Iceland 81.8
## 4 Europe Switzerland 81.7
## 5 Oceania Australia 81.2
## 6 Europe Spain 80.9
## 7 Europe Sweden 80.9
## 8 Asia Israel 80.7
## 9 Europe France 80.7
## 10 Americas Canada 80.7
Similarly, we can find the 10 countries with the lowest life expectancy.
# Ten countries with the lowest life expectancy in 2007, lowest first.
df %>%
  filter(year == 2007) %>%
  arrange(lifeExp) %>%
  select(continent, country, lifeExp) %>%
  head(n = 10)
## # A tibble: 10 x 3
## continent country lifeExp
## <fct> <fct> <dbl>
## 1 Africa Swaziland 39.6
## 2 Africa Mozambique 42.1
## 3 Africa Zambia 42.4
## 4 Africa Sierra Leone 42.6
## 5 Africa Lesotho 42.6
## 6 Africa Angola 42.7
## 7 Africa Zimbabwe 43.5
## 8 Asia Afghanistan 43.8
## 9 Africa Central African Republic 44.7
## 10 Africa Liberia 45.7
Top 10 GDP per capita by country
# Ten countries with the highest GDP per capita in 2007, richest first.
df %>%
  filter(year == 2007) %>%
  arrange(desc(gdpPercap)) %>%
  select(continent, country, gdpPercap) %>%
  head(n = 10)
## # A tibble: 10 x 3
## continent country gdpPercap
## <fct> <fct> <dbl>
## 1 Europe Norway 49357.
## 2 Asia Kuwait 47307.
## 3 Asia Singapore 47143.
## 4 Americas United States 42952.
## 5 Europe Ireland 40676.
## 6 Asia Hong Kong, China 39725.
## 7 Europe Switzerland 37506.
## 8 Europe Netherlands 36798.
## 9 Americas Canada 36319.
## 10 Europe Iceland 36181.
For each continent, what are the top 3 countries with the highest GDP per capita?
# Three countries with the highest GDP per capita on each continent in
# 1997 (Oceania left out), listed continent by continent, richest first.
# arrange() is applied before grouping; the rows are sorted the same way
# either way because the sort keys start with `continent`.
df %>%
  filter(year == 1997, continent != "Oceania") %>%
  select(continent, country, gdpPercap, pop, lifeExp) %>%
  arrange(continent, desc(gdpPercap)) %>%
  group_by(continent) %>%
  top_n(n = 3, wt = gdpPercap)
## # A tibble: 12 x 5
## # Groups: continent [4]
## continent country gdpPercap pop lifeExp
## <fct> <fct> <dbl> <int> <dbl>
## 1 Africa Gabon 14723. 1126189 60.5
## 2 Africa Libya 9467. 4759670 71.6
## 3 Africa Botswana 8647. 1536536 52.6
## 4 Americas United States 35767. 272911760 76.8
## 5 Americas Canada 28955. 30305843 78.6
## 6 Americas Puerto Rico 16999. 3759430 74.9
## 7 Asia Kuwait 40301. 1765345 76.2
## 8 Asia Singapore 33519. 3802309 77.2
## 9 Asia Japan 28817. 125956499 80.7
## 10 Europe Norway 41283. 4405672 78.3
## 11 Europe Switzerland 32135. 7193761 79.4
## 12 Europe Netherlands 30246. 15604464 78.0
# Bubble chart of the three richest countries per continent in 1997
# (Oceania excluded): GDP per capita against life expectancy.
# Population is divided by one million before being mapped to size so
# that the legend values agree with its "Population (M)" title.
df %>%
  filter(year == 1997 & continent != "Oceania") %>%
  select(continent, country, gdpPercap, pop, lifeExp) %>%
  group_by(continent) %>%
  arrange(continent, desc(gdpPercap)) %>%
  top_n(3, gdpPercap) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp, size = pop / 1e6,
             color = country, shape = continent)) +
  geom_point(alpha = 0.7) +
  scale_size(range = c(5, 20), name = "Population (M)")
# Bubble chart for 2007 (Oceania excluded): GDP per capita against life
# expectancy, coloured by continent.
# Population is divided by one million before being mapped to size so
# that the legend values agree with its "Population (M)" title.
df %>%
  filter(year == 2007 & continent != "Oceania") %>%
  ggplot(aes(x = gdpPercap, y = lifeExp, size = pop / 1e6,
             color = continent)) +
  geom_point(alpha = 0.7) +
  scale_size(range = c(.5, 24), name = "Population (M)")
# Life expectancy over time for the Oceania countries, one line each.
# No year filter is applied, so the series covers every year in the data
# (1952 through 2007); the title states that full range.
df %>%
  filter(continent == "Oceania") %>%
  ggplot(aes(x = year, y = lifeExp, color = country)) +
  geom_line(size = 1.0) +
  ggtitle("Life expectancy in Oceania from 1952 to 2007")
3. Basic statistics: mean, median, max, min
# Median, mean, minimum and maximum life expectancy per continent in 2007
# (Oceania excluded).
df %>%
  filter(year == 2007, continent != "Oceania") %>%
  group_by(continent) %>%
  summarise(
    med = median(lifeExp),
    avg = mean(lifeExp),
    min = min(lifeExp),
    max = max(lifeExp)
  )
## # A tibble: 4 x 5
## continent med avg min max
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Africa 52.9 54.8 39.6 76.4
## 2 Americas 72.9 73.6 60.9 80.7
## 3 Asia 72.4 70.7 43.8 82.6
## 4 Europe 78.6 77.6 71.8 81.8
It may be better to visualize this with a box plot.
# Box plot of 2007 life expectancy per continent (Oceania excluded) with
# the individual countries overlaid as horizontally jittered points.
# No group_by() is needed: ggplot2 forms the boxes from the discrete x
# aesthetic, not from dplyr grouping.
df %>%
  filter(year == 2007 & continent != "Oceania") %>%
  ggplot(aes(x = continent, y = lifeExp)) +
  geom_boxplot(outlier.colour = "red") +
  # height = 0 keeps each point at its true life expectancy value.
  geom_jitter(position = position_jitter(width = 0.1, height = 0),
              alpha = 0.75)
# Life expectancy against log GDP per capita in 2007, with one panel,
# one colour and one linear fit per continent (Oceania excluded).
df %>%
  filter(year == 2007, continent != "Oceania") %>%
  ggplot(
    mapping = aes(x = log(gdpPercap), y = lifeExp, colour = continent)
  ) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(~continent)
## `geom_smooth()` using formula 'y ~ x'
# Life expectancy against log GDP per capita in 2007, faceted by
# continent (Oceania excluded). Points are shaded by life expectancy and
# each panel gets a linear fit.
df %>%
  filter(year == 2007, continent != "Oceania") %>%
  ggplot(
    mapping = aes(x = log(gdpPercap), y = lifeExp, colour = lifeExp)
  ) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  facet_wrap(~continent)
## `geom_smooth()` using formula 'y ~ x'
Compare the average life expectancy 40 years apart (1957 vs. 1997)
# Side-by-side bars comparing each continent's mean life expectancy in
# 1957 and 1997 (Oceania excluded).
# `year` is an integer column, so it is matched against integer literals
# rather than relying on implicit coercion of character strings.
ggplot(
  data = df %>%
    filter(year %in% c(1957L, 1997L) & continent != "Oceania") %>%
    group_by(year, continent) %>%
    summarise(Avg_life_expectancy = mean(lifeExp)),
  aes(x = continent, y = Avg_life_expectancy, fill = as.factor(year))
) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "", y = "Average Life Expectancy", fill = "Year") +
  scale_fill_manual(values = c("lightblue", "darkblue"))
Which countries' GDP per capita grew the most over the last 10 years (1997 to 2007)?
# Five countries with the largest absolute increase in GDP per capita
# between 1997 and 2007.
# `year` is an integer column, so it is matched against integer literals
# rather than relying on implicit coercion of character strings.
top5_countries <- df %>%
  select(continent, year, country, gdpPercap) %>%
  filter(year %in% c(1997L, 2007L)) %>%
  # One row per country with the two years spread into columns.
  pivot_wider(names_from = year, values_from = gdpPercap) %>%
  mutate(gdp_difference = `2007` - `1997`) %>%
  # top_n() keeps the existing row order; it does not sort by the gain.
  top_n(5, gdp_difference)
top5_countries
## # A tibble: 5 x 5
## continent country `1997` `2007` gdp_difference
## <fct> <fct> <dbl> <dbl> <dbl>
## 1 Asia Bahrain 20292. 29796. 9504.
## 2 Europe Finland 23724. 33207. 9483.
## 3 Asia Hong Kong, China 28378. 39725. 11347.
## 4 Europe Ireland 24522. 40676. 16154.
## 5 Asia Singapore 33519. 47143. 13624.
# GDP per capita over time for the countries listed in `top5_countries`,
# one coloured line per country.
top_countries <- top5_countries[["country"]]
df %>%
  filter(country %in% top_countries) %>%
  ggplot(mapping = aes(x = year, y = gdpPercap, colour = country)) +
  geom_line(size = 1)
Generate separate histograms of life expectancy for each continent
# Histogram of life expectancy across all years, one panel per continent
# (Oceania excluded). The default bin count is used.
df %>%
  filter(continent != "Oceania") %>%
  ggplot(mapping = aes(x = lifeExp)) +
  geom_histogram() +
  facet_wrap(facets = ~continent)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Generate a scatterplot of the relationship between per capita GDP and life expectancy
# Plain scatterplot of life expectancy against GDP per capita.
df %>%
  ggplot(mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point()
# Same scatterplot coloured by continent, with a smoother per continent
# (the smoothing method is left at ggplot2's default).
df %>%
  ggplot(mapping = aes(x = gdpPercap, y = lifeExp, colour = continent)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Use faceting to identify differences
# Life expectancy against GDP per capita with a default smoother, split
# into one panel per continent (Oceania excluded).
df %>%
  filter(continent != "Oceania") %>%
  ggplot(mapping = aes(x = gdpPercap, y = lifeExp, colour = continent)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(facets = ~continent)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
library(viridis)
## Warning: package 'viridis' was built under R version 3.6.3
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
## The following object is masked from 'package:scales':
##
## viridis_pal
# Show a bubbleplot
# Bubble plot: GDP per capita against life expectancy, bubble size by
# population in millions, colour by continent (viridis palette).
df %>%
  mutate(pop = pop / 1000000) %>%
  # Largest populations first, so they are drawn first and the smaller
  # bubbles end up on top of them.
  arrange(desc(pop)) %>%
  ggplot(aes(x = gdpPercap,
             y = lifeExp,
             size = pop,
             color = continent)) +
  geom_point(alpha = 0.7) +
  scale_size(range = c(1.4, 19), name = "Population (M)") +
  # "none" hides the colour legend; `guide = FALSE` is deprecated.
  scale_color_viridis(discrete = TRUE, guide = "none") +
  theme(legend.position = "bottom")
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 3.6.3
data(gapminder)
Change over time
# Canada and Cambodia only, with the integer year converted to a Date
# (1 January of that year) so that a date scale can be used for plotting.
# The format is passed to as.Date(), not paste(): inside paste() it was
# concatenated into the string and parsing only worked by accident.
df <- filter(gapminder, country %in% c("Canada", "Cambodia")) %>%
  mutate(year = as.Date(paste0(year, "-01-01"), format = "%Y-%m-%d"))
# Line chart with point markers of life expectancy over time, with fixed
# colours for Canada and Cambodia and one x-axis break per year in `df`.
ggplot(data = df, mapping = aes(x = year, y = lifeExp, colour = country)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  scale_x_date(breaks = df$year, date_labels = "%Y") +
  # Start the y axis at zero; the upper limit is taken from the data.
  scale_y_continuous(limits = c(0, NA), labels = scales::comma) +
  scale_colour_manual(values = c("Canada" = "blue", "Cambodia" = "red")) +
  theme(
    legend.position = "bottom",
    panel.grid.major.x = element_blank()
  ) +
  labs(
    title = "",
    subtitle = "Life expectancy in Canada and Cambodia, 1952-2007",
    caption = "Source: Gapminder.org | @traffordDataLab",
    x = "",
    y = "Age (years)",
    colour = NULL
  )
Slope chart
# Slope chart of GDP per capita in 1952 and 2007 for Canada and Cambodia.
df <- filter(gapminder,
             country %in% c("Canada", "Cambodia") & year %in% c(1952, 2007))
ggplot(df) +
  geom_line(aes(x = as.factor(year), y = gdpPercap,
                group = country, colour = country),
            size = 2, alpha = 0.8) +
  geom_point(aes(x = as.factor(year), y = gdpPercap,
                 group = country, colour = country),
             size = 5, alpha = 0.8) +
  # Left-hand labels: country name plus the 1952 value.
  # `size` and `hjust` are constants, so they are set outside aes(); inside
  # aes() `size = 4` was treated as data and passed through a size scale.
  geom_text(data = subset(df, year == 1952),
            aes(x = as.factor(year), y = gdpPercap, colour = country,
                label = paste(country,
                              scales::dollar(round(gdpPercap, 0)),
                              sep = ", ")),
            size = 4, hjust = 1.2) +
  # Right-hand labels: the 2007 value only.
  geom_text(data = subset(df, year == 2007),
            aes(x = as.factor(year), y = gdpPercap, colour = country,
                label = scales::dollar(round(gdpPercap, 0))),
            size = 4, hjust = -0.3) +
  scale_colour_brewer(palette = "Set2") +
  labs(title = "Change in GDP per capita from 1952 to 2007",
       x = NULL,
       y = NULL,
       colour = NULL) +
  theme(panel.grid.major = element_blank(),
        axis.text.y = element_blank(),
        legend.position = "none")
# France, Germany, Ireland and Italy, with the integer year converted to
# a Date (1 January of that year).
# The format is passed to as.Date(), not paste(): inside paste() it was
# concatenated into the string and parsing only worked by accident.
df2 <- gapminder %>%
  filter(country %in% c("France", "Germany", "Ireland", "Italy")) %>%
  mutate(year = as.Date(paste0(year, "-01-01"), format = "%Y-%m-%d"))
df2
## # A tibble: 48 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <date> <dbl> <int> <dbl>
## 1 France Europe 1952-01-01 67.4 42459667 7030.
## 2 France Europe 1957-01-01 68.9 44310863 8663.
## 3 France Europe 1962-01-01 70.5 47124000 10560.
## 4 France Europe 1967-01-01 71.6 49569000 13000.
## 5 France Europe 1972-01-01 72.4 51732000 16107.
## 6 France Europe 1977-01-01 73.8 53165019 18293.
## 7 France Europe 1982-01-01 74.9 54433565 20294.
## 8 France Europe 1987-01-01 76.3 55630100 22066.
## 9 France Europe 1992-01-01 77.5 57374179 24704.
## 10 France Europe 1997-01-01 78.6 58623428 25890.
## # ... with 38 more rows
Correlation:Scatterplot
# Bubble scatterplot for 2007: life expectancy against GDP per capita on
# a log10 x axis, bubble area by population, fill by continent.
ggplot(filter(gapminder, year == 2007), aes(x = gdpPercap, y = lifeExp)) +
  scale_x_log10(labels = scales::dollar) +
  # shape 21 is a filled circle with a separate outline colour.
  geom_point(aes(size = pop, fill = continent),
             shape = 21, colour = "white", alpha = 0.9) +
  scale_fill_brewer(palette = "Set2") +
  scale_size_continuous(range = c(1, 20)) +
  labs(title = "Relationship between life expectancy and income, 2007",
       x = "GDP per capita ($)",
       y = "Life expectancy (years)") +
  # "none" hides the size legend; `size = FALSE` is deprecated.
  guides(size = "none") +
  theme(panel.grid.major.x = element_blank(),
        legend.position = "right",
        legend.title = element_blank())
In Europe, which countries have GDP above the median (in 2007) ?
# European countries in 2007 with each one's distance from the median
# GDP per capita, sorted from furthest below to furthest above.
df <- gapminder %>%
  filter(year == 2007, continent == "Europe") %>%
  mutate(
    median = median(gdpPercap),
    diff = gdpPercap - median,
    type = ifelse(gdpPercap < median, "Below", "Above")
  ) %>%
  arrange(diff) %>%
  # Fix the factor levels in the sorted order so plots keep this ranking.
  mutate(country = factor(country, levels = country))
df
## # A tibble: 30 x 9
## country continent year lifeExp pop gdpPercap median diff type
## <fct> <fct> <int> <dbl> <int> <dbl> <dbl> <dbl> <chr>
## 1 Albania Europe 2007 76.4 3.60e6 5937. 28054. -22117. Below
## 2 Bosnia and Her~ Europe 2007 74.9 4.55e6 7446. 28054. -20608. Below
## 3 Turkey Europe 2007 71.8 7.12e7 8458. 28054. -19596. Below
## 4 Montenegro Europe 2007 74.5 6.85e5 9254. 28054. -18800. Below
## 5 Serbia Europe 2007 74.0 1.02e7 9787. 28054. -18268. Below
## 6 Bulgaria Europe 2007 73.0 7.32e6 10681. 28054. -17373. Below
## 7 Romania Europe 2007 72.5 2.23e7 10808. 28054. -17246. Below
## 8 Croatia Europe 2007 75.7 4.49e6 14619. 28054. -13435. Below
## 9 Poland Europe 2007 75.6 3.85e7 15390. 28054. -12664. Below
## 10 Hungary Europe 2007 73.3 9.96e6 18009. 28054. -10045. Below
## # ... with 20 more rows
# Horizontal diverging bars: each country's `diff` value, filled by
# whether `type` is "Above" or "Below".
ggplot(data = df, mapping = aes(x = country, y = diff, label = country)) +
  geom_col(mapping = aes(fill = type), width = 0.5, alpha = 0.8) +
  scale_fill_manual(
    labels = c("Above median", "Below median"),
    values = c("Above" = "purple", "Below" = "blue")
  ) +
  scale_y_continuous(labels = scales::dollar, expand = c(0, 0)) +
  coord_flip() +
  theme(panel.grid.major.y = element_blank()) +
  labs(
    title = "GDP per capita, 2007",
    x = NULL,
    y = NULL,
    fill = NULL
  )
Distribution: Density plot
# Overlaid density curves of 2007 life expectancy, one per continent
# (Oceania excluded).
gapminder %>%
  filter(year == 2007, continent != "Oceania") %>%
  ggplot(mapping = aes(x = lifeExp)) +
  geom_density(mapping = aes(fill = continent), size = 0.1, alpha = 0.5) +
  scale_fill_brewer(palette = "Set2") +
  theme(panel.grid.major.x = element_blank()) +
  labs(
    title = "Life expectancy distribution in 2007",
    x = "Age (years)",
    y = "",
    fill = NULL
  )
Boxplot
# Box plot of 2007 life expectancy per continent, filled by continent
# with the legend suppressed.
gapminder %>%
  filter(year == 2007) %>%
  ggplot(mapping = aes(x = continent, y = lifeExp, fill = continent)) +
  geom_boxplot(colour = "#757575", alpha = 0.8) +
  scale_fill_brewer(palette = "Set2") +
  theme(
    legend.position = "none",
    panel.grid.major.x = element_blank()
  ) +
  labs(
    title = "Life expectancy distributions, 2007",
    x = "",
    y = "Age (years)"
  )
Ridgeline plot
library(ggridges)
# Ridgeline plot of 2007 life expectancy, one ridge per continent
# (Oceania excluded).
df <- filter(gapminder, year == 2007, continent != "Oceania")
# fct_rev() reverses the continent level order on the y axis.
ggplot(
  data = df,
  mapping = aes(x = lifeExp, y = fct_rev(continent), fill = continent)
) +
  geom_density_ridges(colour = "#bdbdbd", size = 0.5, alpha = 0.5) +
  scale_fill_brewer(palette = "Set2") +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  theme(
    legend.position = "none",
    panel.grid.major.x = element_blank()
  ) +
  labs(
    title = "Life expectancy distribution, 2007",
    x = "Life Expectency (years)",
    y = ""
  )
## Picking joint bandwidth of 2.48
Magnitude
Bar chart (vertical)
# Median GDP per capita of each continent in 2007, shown as vertical bars.
df <- gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarise(median = median(gdpPercap))
ggplot(
  data = df,
  mapping = aes(x = continent, y = median, fill = continent)
) +
  geom_col(alpha = 0.8) +
  scale_y_continuous(expand = c(0, 0), labels = scales::dollar) +
  scale_fill_brewer(palette = "Set2") +
  theme(
    legend.position = "none",
    panel.grid.major.x = element_blank()
  ) +
  labs(
    title = "",
    subtitle = "Median GDP per capita by continent, 2007",
    caption = "Source: Gapminder.org | @traffordDataLab",
    x = NULL,
    y = "GDP per capita",
    fill = NULL
  )
Grouped bar chart
# Total population per continent for every survey year after 1990,
# drawn as bars grouped by year.
df <- gapminder %>%
  filter(year > 1990) %>%
  group_by(year, continent) %>%
  # `pop` is an integer column; converting to double before summing avoids
  # integer overflow on continent-wide totals.
  summarise(totalpop = sum(as.double(pop))) %>%
  # Drop the leftover `year` grouping so `df` is a plain table.
  ungroup()
ggplot(df, aes(x = year, y = totalpop,
               group = continent, fill = continent)) +
  geom_col(position = "dodge", colour = "#757575", size = 0.2, alpha = 0.8) +
  scale_x_continuous(breaks = seq(1992, 2007, 5), expand = c(0, 0)) +
  scale_y_continuous(labels = scales::comma, expand = c(0, 0)) +
  scale_fill_brewer(palette = "Set2") +
  # The plotted years are 1992-2007 (matching the axis breaks above).
  labs(title = "Total population by continent, 1992-2007",
       x = NULL,
       y = NULL,
       fill = NULL) +
  theme(panel.grid.major.x = element_blank(),
        legend.position = "bottom")
Stacked bar chart
Part-to-whole
100% stacked bar chart
# Each continent's share of total population for every survey year after
# 1990, drawn as 100% stacked bars.
df <- gapminder %>%
  filter(year > 1990) %>%
  group_by(year, continent) %>%
  # `pop` is an integer column; converting to double before summing avoids
  # integer overflow on continent-wide totals.
  summarise(totalpop = sum(as.double(pop))) %>%
  # Drop the leftover `year` grouping so `df` is a plain table.
  ungroup()
ggplot(df, aes(x = year, y = totalpop, fill = continent)) +
  # position = "fill" rescales every bar to sum to 1.
  geom_col(position = "fill", colour = "#757575", size = 0.2, alpha = 0.8) +
  scale_x_continuous(breaks = seq(1992, 2007, 5), expand = c(0, 0)) +
  scale_y_continuous(labels = scales::percent, expand = c(0, 0)) +
  scale_fill_brewer(palette = "Set2") +
  guides(fill = guide_legend(reverse = TRUE)) +
  # The plotted years are 1992-2007 (matching the axis breaks above).
  labs(title = "Proportion of total population by continent, 1992-2007",
       x = NULL,
       y = NULL,
       fill = NULL) +
  theme(panel.grid.major.x = element_blank(),
        legend.position = "right")
Treemap
library(treemapify)
# Treemap of total GDP (population x GDP per capita) by country in 2007,
# with countries nested inside their continent (Oceania excluded).
df <- filter(gapminder, year == 2007, continent != "Oceania") %>%
  mutate(gdp = pop * gdpPercap)
ggplot(
  data = df,
  mapping = aes(
    area = gdp,
    fill = continent,
    subgroup = continent,
    label = country
  )
) +
  geom_treemap() +
  geom_treemap_subgroup_border(colour = "black") +
  geom_treemap_subgroup_text(
    fontface = "bold",
    colour = "#f0f0f0",
    alpha = 0.7,
    place = "bottomleft"
  ) +
  geom_treemap_text(colour = "white", place = "centre", reflow = TRUE) +
  scale_fill_brewer(palette = "Set2") +
  theme(legend.position = "none") +
  labs(
    title = "Country GDP by continent, 2007",
    x = NULL,
    y = NULL,
    fill = NULL
  )
Ranking
Ordered bar chart (horizontal)
# Median GDP per capita of each continent in 2007 as horizontal bars,
# with continents reordered by their median value.
df <- gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarise(median = median(gdpPercap))
ggplot(
  data = df,
  mapping = aes(x = reorder(continent, -median, FUN = sum), y = median)
) +
  geom_col(fill = "#fc6721", alpha = 0.8) +
  scale_y_continuous(labels = scales::dollar, expand = c(0, 0)) +
  coord_flip() +
  theme(panel.grid.major.y = element_blank()) +
  labs(
    title = "Median GDP per capita by continent, 2007",
    x = NULL,
    y = "GDP per capita",
    fill = NULL
  )
Lollipop chart
# Lollipop chart of 2007 GDP per capita for the countries of the
# Americas, ordered from lowest to highest.
df <- filter(gapminder, year == 2007, continent == "Americas") %>%
  arrange(gdpPercap) %>%
  # Fix the factor levels in the sorted order so the y axis keeps it.
  mutate(country = factor(country, levels = country))
ggplot(data = df, mapping = aes(x = gdpPercap, y = country)) +
  # Stem of each lollipop, from zero to the country's value.
  geom_segment(
    mapping = aes(x = 0, xend = gdpPercap, y = country, yend = country),
    colour = "purple"
  ) +
  geom_point(colour = "blue", size = 5, alpha = 0.8) +
  # Extend the x axis 10% past the largest value.
  scale_x_continuous(
    labels = scales::dollar,
    limits = c(0, max(df$gdpPercap) * 1.1),
    expand = c(0, 0)
  ) +
  theme(
    axis.text.y = element_text(hjust = 0),
    panel.grid.major = element_blank()
  ) +
  labs(
    title = "",
    subtitle = "GDP per capita in American countries, 2007",
    x = NULL,
    y = NULL,
    fill = NULL
  )