Summary
This short analysis was done for self-training purposes. The dataset was found on Gapminder; the data originates from the WHO.
Data source: https://www.gapminder.org/data/
Data file: https://docs.google.com/spreadsheets/d/1UTd3D_D89fViE_ZN5wnIiemv2dyhtTS-92qIv_lJhcI/pub?gid=0
The indicator represents the recorded and estimated average alcohol consumption per adult (15+), expressed in litres of pure alcohol per capita.
library(ggplot2)
library(dplyr)
library(tidyr)
library(countrycode)
library(maps)
library(forcats)

# Load the alcohol-consumption indicator file. The CSV is semicolon-separated
# with a header row; strings are read as plain character vectors (not factors).
# NOTE(review): the path below is machine-specific - adjust it to wherever the
# CSV is stored before running.
data.source <- read.csv(
  "C:/Users/marc/Desktop/Data/160905_alcohol/indicator_alcohol_consumption_20100830.csv",
  sep = ";",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Convert to a tibble. as_tibble() replaces tbl_df(), which has been deprecated
# since dplyr 1.0.0.
data.source <- as_tibble(data.source)
Data cleaning
We will look at the 2008 data only, so the columns for the other years can be removed.
# Inspect the available columns: "X" plus one "X<year>" column per year.
colnames(data.source)
## [1] "X" "X1985" "X1986" "X1987" "X1988" "X1989" "X1990" "X1991"
## [9] "X1992" "X1993" "X1994" "X1995" "X1996" "X1997" "X1998" "X1999"
## [17] "X2000" "X2001" "X2002" "X2003" "X2004" "X2005" "X2006" "X2007"
## [25] "X2008"
# Keep only the "X" column and the 2008 values, renaming both in the same step.
data.cleaned <- select(
  data.source,
  region = X,
  alcohol.consumption = X2008
)
Prepare data for the chart
We need to map the countries in the dataset to their longitude/latitude coordinates. We will use the map_data() function for this.
world <- map_data("world")
# Flag every dataset country whose name has an exact match in the map data.
check <- data.cleaned$region %in% world$region
# A few countries don't match, so their names are adjusted manually below.
data.cleaned$region[!check]
## [1] "Antigua and Barbuda" "Congo [DRC]"
## [3] "Congo [Republic]" "Côte d'Ivoire"
## [5] "Macedonia [FYROM]" "Myanmar [Burma]"
## [7] "Saint Kitts and Nevis" "Saint Vincent and the Grenadines"
## [9] "São Tomé and Príncipe" "Trinidad and Tobago"
## [11] "Tuvalu" "United Kingdom"
## [13] "United States"
# 1. Countries that can be renamed to the spelling used in world$region
#    (new name on the left, dataset name on the right).
data.cleaned$region <- fct_recode(
  as.factor(data.cleaned$region),
  "USA" = "United States",
  "Republic of Congo" = "Congo [Republic]",
  "Democratic Republic of the Congo" = "Congo [DRC]",
  "Ivory Coast" = "Côte d'Ivoire",
  "Macedonia" = "Macedonia [FYROM]",
  "Myanmar" = "Myanmar [Burma]",
  "Sao Tome and Principe" = "São Tomé and Príncipe",
  "UK" = "United Kingdom"
)
# 2. Country to remove: Tuvalu could not be found in the map's country list.
# 3. Countries to remove: some countries such as "Antigua and Barbuda" are
#    treated as two separate regions in world$region (here "Antigua" and
#    "Barbuda"). Without more detailed data they are dropped as well.
data.cleaned <- filter(
  data.cleaned,
  region != "Tuvalu",
  region != "Antigua and Barbuda",
  region != "Trinidad and Tobago",
  region != "Saint Vincent and the Grenadines",
  region != "Saint Kitts and Nevis"
)
Chart
# Attach the consumption values to the map points by region name, then sort the
# rows by the map data's `order` column.
data.chart <- arrange(
  merge(world, data.cleaned, by = "region", sort = FALSE),
  order
)
# The palette was found on http://www.colorhunt.co/
my.palette <- c("#94ED88", "#3BB873", "#406661", "#384137")

# World map with one polygon per map group, filled by 2008 consumption.
# Axes, lines and the panel border are all hidden.
ggplot(data.chart, aes(x = long, y = lat)) +
  geom_polygon(aes(fill = alcohol.consumption, group = group)) +
  theme_classic() +
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    line = element_blank(),
    panel.border = element_blank()
  ) +
  coord_map(xlim = c(-180, 180)) +
  scale_fill_gradientn(colours = my.palette, breaks = c(5, 10, 15, 20)) +
  ggtitle("Alcohol consumption per adult (15+) in 2008 in litres")