## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
## Chunk defaults: no prefix on printed output, warnings and messages hidden,
## code echoed as written (no tidying), 7 x 7 inch figures.
## opts_chunk is namespace-qualified so the call works whether or not knitr
## is attached to the search path.
knitr::opts_chunk$set(
  comment = "", warning = FALSE, message = FALSE, echo = TRUE,
  tidy = FALSE, fig.width = 7, fig.height = 7
)
## Wide console output; scipen = 10 biases number printing toward fixed notation
options(width = 116, scipen = 10)
## Load datamart package
library(datamart)
## Establish connection
gm <- gapminder()
## Define a wrapper function for typical datasets
##
## Queries one dataset through the connection and reshapes it from wide
## format (one column per country, dates in the row names) to long format.
##
## Args:
##   gm:        connection object, passed as the first argument to query().
##   data.name: name of the dataset to query; also used as the name of the
##              value column in the result.
##
## Returns:
##   A data frame with the columns `date` (Date), `country`, and one value
##   column named after `data.name`.
gapminder2df <- function(gm, data.name) {
  ## library() rather than require(): stop right here if reshape2 is missing
  ## instead of failing later with "could not find function melt"
  library(reshape2)

  data.object <- query(gm, data.name)

  ## data.frame() may alter the column names; restore the originals
  df.data.object <- data.frame(data.object)
  names(df.data.object) <- names(data.object)

  ## The row names are parsed as dates (assumes a format that as.Date()
  ## understands without an explicit format string)
  df.data.object$date <- as.Date(rownames(df.data.object))

  ## Wide -> long: every non-date column becomes a level of `country`
  melt(data = df.data.object,
       id.vars = "date",
       variable.name = "country",
       value.name = data.name)
}
## Query for data available: print the names of the datasets that can be
## requested through the connection (the console output is reproduced below)
queries(gm)
[1] "Population" "MainReligion" "TotalFertilityRate"
[4] "PerCapitaCO2Emissions" "IncomePerCapita" "InfantMortalityRate"
[7] "LifeExpectancyAtBirth" "AdolescentFertilityRate" "BirthsAttendedBySkilledHealthStaff"
[10] "ContraceptiveUse" "CrudeBirthRate" "MaternalMortalityRate"
[13] "Under5MortalityRate" "CrudeDeathRate" "PopulationGrowth"
[16] "SugarConsumption" "GDP" "ConsumerPricesIndex"
[19] "GDPImplicitDeflator" "CoalConsumption" "HydroelectricityConsumption"
[22] "NaturalGasConsumption" "NuclearConsumption" "OilConsumption"
[25] "CoalProduction" "ElectricityGeneration" "NaturalGasProduction"
[28] "OilProduction" "PrimaryEnergyConsumption" "CO2Emissions"
[31] "SulfurEmissions" "TotalForestArea" "PrimaryForestArea"
[34] "PlantedForestArea" "WoodRemoval" "BiomassStockInForest"
[37] "TotalWaterWithdrawal" "SurfaceArea" "BadTeethPerChild"
[40] "PeopleLivingWithHIV" "MalariaReportedCases" "MalariaReportedDeaths"
[43] "WorkingHoursPerWeek" "UrbanPopulation" "WomensAgeAtFirstMarriage"
[46] "NumberOfBillionaires" "GiniIndex" "BroadbandSubscribers"
[49] "CellPhones" "PersonalComputers" "PatentApplications"
[52] "PatentsGranted" "PatentsInForce" "ArmsExports"
[55] "ArmsImports" "HumanDevelopmentIndex"
## These share a common structure: time ~ country (values in cells)
TotalFertilityRate <- gapminder2df(gm, "TotalFertilityRate")
IncomePerCapita <- gapminder2df(gm, "IncomePerCapita")
Population <- gapminder2df(gm, "Population")

## MainReligion has only two columns: Entity and Group
MainReligion <- data.frame(query(gm, "MainReligion"))
names(MainReligion) <- c("country", "MainReligion")

## Recode blank religion entries as "other".
## The column is coerced to character first so that "other" can be assigned
## even if it arrived as a factor, and %in% is used instead of == because it
## never yields NA (an NA row subscript makes `[<-` on a data frame fail).
MainReligion$MainReligion <- as.character(MainReligion$MainReligion)
MainReligion$MainReligion[MainReligion$MainReligion %in% ""] <- "other"
MainReligion$MainReligion <- factor(MainReligion$MainReligion)
## Reference
## http://stackoverflow.com/questions/8091303/merge-multiple-data-frames-in-a-list-simultaneously
## left reduce computes l_1 = f(v_1, v_2), l_2 = f(l_1, v_3), etc., and returns l_{n-1} = f(l_{n-2}, v_n)
list.of.data.frames <- list(TotalFertilityRate = TotalFertilityRate,
                            IncomePerCapita = IncomePerCapita,
                            Population = Population)

## Full outer join of the long-format datasets. The keys are spelled out so
## that an unexpected extra shared column cannot silently change the join.
df.merged <- Reduce(
  function(x, y) merge(x, y, by = c("date", "country"), all = TRUE),
  list.of.data.frames
)

## Left join: keep every row of df.merged and attach MainReligion by country
df.merged <- merge(x = df.merged, y = MainReligion, by = "country",
                   all.x = TRUE, all.y = FALSE)

## Use only complete cases (inner join is another option)
df.merged.complete <- df.merged[complete.cases(df.merged), ]

countries.of.interestn <- c(
  "Japan", "United Kingdom", "USSR", "Russia", "China", "United States",
  "India", "Iran", "Saudi Arabia", "Qatar", "Korea, Rep.", "Korea, Dem. Rep.",
  "Brunei", "Kuwait", "Norway", "Luxembourg", "United Arab Emirates",
  "Korea, United", "Iraq", "Sweden"
)

## Label column for the plots: the country name for countries of interest and
## an empty string for all others. ifelse() also copes with zero rows.
df.merged.complete$count.int <- ifelse(
  df.merged.complete$country %in% countries.of.interestn,
  as.character(df.merged.complete$country),
  ""
)
The total fertility rate (babies born per woman) is lower for high-income data points, except for some data points in the Muslim group.
## ggplot2
library(ggplot2)
## Fertility vs. income for every country-date row, one panel per main
## religion. Points are coloured by country and sized by population; the
## legend is hidden.
ggplot(df.merged.complete) +
  geom_point(aes(x = IncomePerCapita, y = TotalFertilityRate,
                 group = MainReligion, color = country, size = Population)) +
  theme_bw() +
  scale_x_continuous(limits = c(0, 120000)) +
  scale_y_continuous(limits = c(0, 10)) +
  facet_wrap(~ MainReligion, drop = FALSE) +
  ## theme() replaces opts(), which is no longer available in ggplot2
  theme(legend.position = "none")
## Keep only the rows dated 2008-01-01
data.2008 <-
  df.merged.complete[which(df.merged.complete$date == as.Date("2008-01-01")), ]
## Black-and-white theme, one facet per main religion; points are drawn with
## alpha = 1/3 so overlapping observations stay visible, and the countries of
## interest are labelled through count.int
ggplot(data.2008, aes(x = IncomePerCapita, y = TotalFertilityRate)) +
  geom_point(aes(group = MainReligion), alpha = 1/3) +
  geom_text(aes(label = count.int),
            hjust = -0.1, vjust = 0.5, angle = 45, size = 3) +
  theme_bw() +
  facet_wrap(~ MainReligion)
In most cases, the total fertility rate (babies born per woman) decreased before these countries became rich, except for some Muslim countries in 1981.
## Draw the fertility-vs-income scatter plot for the rows of a single date.
##
## Args:
##   single.year: rows of df.merged.complete that share one `date`; the date
##                of the first row is used as the plot title.
##
## The plot is printed as a side effect (needed inside d_ply(), where the
## result of the function is discarded).
draw_year_plot <- function(single.year) {
  gg.graph <-
    ggplot(single.year) +
    geom_point(aes(x = IncomePerCapita, y = TotalFertilityRate,
                   group = MainReligion, color = country),
               alpha = 2/3) +
    geom_text(aes(x = IncomePerCapita, y = TotalFertilityRate,
                  label = count.int),
              hjust = -0.1, vjust = 0.5, angle = 45, size = 3) +
    theme_bw() +
    ## Fixed axes so that successive frames are directly comparable
    scale_x_continuous(limits = c(0, 120000)) +
    scale_y_continuous(limits = c(0, 10)) +
    facet_wrap(~ MainReligion, drop = FALSE) +
    ## ggtitle() + theme() replace opts(), which is no longer available in
    ## ggplot2; the Date is formatted explicitly so the title is a string
    ggtitle(format(single.year[1, "date"])) +
    theme(legend.position = "none")
  print(gg.graph)
}

## January 1st of every tenth year: 1901, 1911, ..., 2011
every.x.years <- as.Date(paste0(seq(1901, 2011, by = 10), "-01-01"))

## Use plyr::d_ply() for looping: one plot per selected date
library(plyr)
d_ply(.data = subset(df.merged.complete, date %in% every.x.years),
      .variables = "date",
      .fun = draw_year_plot)

## Same plot for every available date, recorded between ani.start() and
## ani.stop()
library(animation)
## NOTE(review): ani.start()/ani.stop() are legacy entry points of the
## animation package -- confirm they exist in the installed version
## (saveHTML() looks like the current replacement).
ani.start()
d_ply(.data = df.merged.complete,
      .variables = "date",
      .fun = draw_year_plot)
ani.stop()