Getting data from GapMinder.org Revised

## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
## Chunk defaults: no prefix on printed output, hide warnings and messages,
## echo the code, leave code formatting untouched, 7 x 7 figures.
## opts_chunk is namespace-qualified so this chunk also works when knitr is
## loaded but not attached; a bare `opts_chunk` is then not found.
knitr::opts_chunk$set(comment = "", warning = FALSE, message = FALSE, echo = TRUE,
    tidy = FALSE, fig.width = 7, fig.height = 7)
## Wider printed output; a positive scipen biases printing towards fixed notation
options(width = 116, scipen = 10)

References

Use gapminder() function

## Attach the datamart package, which supplies gapminder()
library("datamart")

## Create the Gapminder data source object that later queries are run against
gm <- gapminder()

## Define a wrapper function for typical datasets
##
## Queries one dataset from a gapminder data source and returns it in long
## format, one row per date/country combination.
##
## Arguments:
##   gm        - data source object, passed as the first argument to query()
##   data.name - name of the dataset to request; also used as the name of the
##               value column in the result
##
## Returns a data frame with the columns "date", "country" and <data.name>.
gapminder2df <- function(gm, data.name) {
    ## library() stops with a clear error if reshape2 is not installed;
    ## require() would only warn and let melt() fail further down.
    library(reshape2)

    data.object           <- query(gm, data.name)
    df.data.object        <- data.frame(data.object)
    ## data.frame() may alter column names, so restore the original ones
    names(df.data.object) <- names(data.object)
    ## Row names are assumed to hold dates that as.Date() can parse
    df.data.object$date   <- as.Date(rownames(df.data.object))

    ## Wide (one column per country) to long (one row per date/country)
    melt(data          = df.data.object,
         id.vars       = "date",
         variable.name = "country",
         value.name    = as.character(data.name))
}

## Query for data available
## Prints the names of the datasets that can be requested from gm with query()
queries(gm)

 [1] "Population"                         "MainReligion"                       "TotalFertilityRate"                
 [4] "PerCapitaCO2Emissions"              "IncomePerCapita"                    "InfantMortalityRate"               
 [7] "LifeExpectancyAtBirth"              "AdolescentFertilityRate"            "BirthsAttendedBySkilledHealthStaff"
[10] "ContraceptiveUse"                   "CrudeBirthRate"                     "MaternalMortalityRate"             
[13] "Under5MortalityRate"                "CrudeDeathRate"                     "PopulationGrowth"                  
[16] "SugarConsumption"                   "GDP"                                "ConsumerPricesIndex"               
[19] "GDPImplicitDeflator"                "CoalConsumption"                    "HydroelectricityConsumption"       
[22] "NaturalGasConsumption"              "NuclearConsumption"                 "OilConsumption"                    
[25] "CoalProduction"                     "ElectricityGeneration"              "NaturalGasProduction"              
[28] "OilProduction"                      "PrimaryEnergyConsumption"           "CO2Emissions"                      
[31] "SulfurEmissions"                    "TotalForestArea"                    "PrimaryForestArea"                 
[34] "PlantedForestArea"                  "WoodRemoval"                        "BiomassStockInForest"              
[37] "TotalWaterWithdrawal"               "SurfaceArea"                        "BadTeethPerChild"                  
[40] "PeopleLivingWithHIV"                "MalariaReportedCases"               "MalariaReportedDeaths"             
[43] "WorkingHoursPerWeek"                "UrbanPopulation"                    "WomensAgeAtFirstMarriage"          
[46] "NumberOfBillionaires"               "GiniIndex"                          "BroadbandSubscribers"              
[49] "CellPhones"                         "PersonalComputers"                  "PatentApplications"                
[52] "PatentsGranted"                     "PatentsInForce"                     "ArmsExports"                       
[55] "ArmsImports"                        "HumanDevelopmentIndex"

Obtain data

## These share a common structure: time ~ country (values in cells)
TotalFertilityRate <- gapminder2df(gm, "TotalFertilityRate")
IncomePerCapita    <- gapminder2df(gm, "IncomePerCapita")
Population         <- gapminder2df(gm, "Population")

## MainReligion has only two columns: Entity and Group
MainReligion        <- data.frame(query(gm, "MainReligion"))
names(MainReligion) <- c("country", "MainReligion")

## Work on a character vector: assigning "other" into a factor that lacks
## that level would produce NA (with a warning) instead of the new label.
MainReligion$MainReligion <- as.character(MainReligion$MainReligion)

## Relabel empty groups as "other". which() drops NA comparisons, so missing
## groups stay NA instead of making the assignment fail.
MainReligion$MainReligion[which(MainReligion$MainReligion == "")] <- "other"
MainReligion$MainReligion <- factor(MainReligion$MainReligion)

Use Reduce for merging multiple data frames

## Reference
## http://stackoverflow.com/questions/8091303/merge-multiple-data-frames-in-a-list-simultaneously
## Reduce() folds from the left: the first two data frames are merged, that
## result is merged with the third, and so on until one data frame remains.
list.of.data.frames <- list(
    TotalFertilityRate = TotalFertilityRate,
    IncomePerCapita    = IncomePerCapita,
    Population         = Population
)

## Full outer join on the two key columns every data frame shares
df.merged <- Reduce(
    f = function(left, right) {
        merge(left, right, by = c("date", "country"), all = TRUE)
    },
    x = list.of.data.frames
)

## Left join: keep every row of df.merged and attach MainReligion by country
df.merged <- merge(df.merged, MainReligion, by = "country", all.x = TRUE)

## Use only complete cases (inner join is another option)
df.merged.complete <- subset(df.merged, complete.cases(df.merged))

Mark countries of interest

## Countries to label in the plots. The trailing "n" in the variable name is
## kept as-is in case other code refers to it.
countries.of.interestn <- c(
    "Japan", "United Kingdom", "USSR", "Russia", "China", "United States",
    "India", "Iran", "Saudi Arabia", "Qatar", "Korea, Rep.",
    "Korea, Dem. Rep.", "Brunei", "Kuwait", "Norway", "Luxembourg",
    "United Arab Emirates", "Korea, United", "Iraq", "Sweden"
)

## count.int holds the country name for countries of interest and "" for all
## others, so it can be used directly as a text label. Starting from the full
## name vector and blanking the rest also works on a zero-row data frame,
## where assigning a single "" to the new column would fail.
df.merged.complete$count.int <- as.character(df.merged.complete$country)
df.merged.complete$count.int[
    !(df.merged.complete$count.int %in% countries.of.interestn)
] <- ""

Plotting all years with ggplot2:

x = IncomePerCapita,
y = TotalFertilityRate,
group = MainReligion,
color = country,
size = Population

The total fertility rate (babies born per woman) is lower for high-income data points, except for some data points in the Muslim group.

## ggplot2
library(ggplot2)

## One point per row of df.merged.complete, coloured by country and sized by
## population, with one panel per main religion.
ggplot(df.merged.complete) +
    geom_point(aes(x = IncomePerCapita, y = TotalFertilityRate,
                   group = MainReligion, color = country, size = Population)) +
    theme_bw() +
    ## `limits` spelled out; `limit` only worked via partial argument matching
    scale_x_continuous(limits = c(0, 120000)) +
    scale_y_continuous(limits = c(0, 10)) +
    facet_wrap(~ MainReligion, drop = FALSE) +
    ## theme() replaces opts(), which is no longer available in ggplot2
    theme(legend.position = "none")

plot of chunk unnamed-chunk-6

Plotting year 2008 with ggplot2

x = IncomePerCapita,
y = TotalFertilityRate,
group = MainReligion,
color = not used (unnecessary for single-year data)
size = not set (it would add little information)
label = count.int (countries of interest)

## Keep only the rows dated 2008-01-01
data.2008 <- df.merged.complete[
    which(df.merged.complete$date == as.Date("2008-01-01")),
]

## Black-and-white theme, one panel per main religion; points are drawn with
## alpha = 1/3 so overlapping points remain visible, and countries of interest
## are labelled via count.int.
ggplot(data.2008, aes(x = IncomePerCapita, y = TotalFertilityRate)) +
    geom_point(aes(group = MainReligion), alpha = 1/3) +
    geom_text(aes(label = count.int),
              hjust = -0.1, vjust = 0.5, angle = 45, size = 3) +
    theme_bw() +
    facet_wrap(~ MainReligion)

plot of chunk unnamed-chunk-7

Plotting every ten years from 1901 to 2011

x = IncomePerCapita,
y = TotalFertilityRate,
group = MainReligion,
color = country,
label = count.int (countries of interest)

In most cases, the total fertility rate (babies born per woman) decreased before these countries became rich, except for some Muslim countries in 1981.

## 1 January of every tenth year from 1901 to 2011
every.x.years <- as.Date(paste0(seq(1901, 2011, by = 10), "-01-01"))

## Use plyr::d_ply() for looping: the data are split by date and one plot is
## printed per date; d_ply() discards the function's return value.
library(plyr)
d_ply(.data = subset(df.merged.complete, date %in% every.x.years),
      .variables = "date",
      .fun = function(single.year) {

          gg.graph <-
              ggplot(single.year) +
                  geom_point(aes(x = IncomePerCapita, y = TotalFertilityRate,
                                 group = MainReligion, color = country),
                             alpha = 2/3) +
                  geom_text(aes(x = IncomePerCapita, y = TotalFertilityRate,
                                label = count.int),
                            hjust = -0.1, vjust = 0.5, angle = 45, size = 3) +
                  theme_bw() +
                  ## Fixed axis limits keep the panels comparable across years
                  scale_x_continuous(limits = c(0, 120000)) +
                  scale_y_continuous(limits = c(0, 10)) +
                  facet_wrap(~ MainReligion, drop = FALSE) +
                  ## ggtitle() + theme() replace opts(), which is no longer
                  ## available in ggplot2; format() turns the Date into text.
                  ggtitle(format(single.year[1, "date"])) +
                  theme(legend.position = "none")

          ## ggplot objects are not auto-printed inside a function
          print(gg.graph)
      })

plot of chunk unnamed-chunk-8

You can animate it using the animation package. (Not possible on RPubs)

library(animation)

## Draw one frame per date and let saveHTML() collect the frames into an HTML
## animation (written to index.html in the working directory by default).
## saveHTML() is used in place of the ani.start()/ani.stop() pair, which the
## animation package deprecated in favour of it.
saveHTML({
    d_ply(.data = df.merged.complete,
          .variables = "date",
          .fun = function(single.year) {

              gg.graph <-
                  ggplot(single.year) +
                      geom_point(aes(x = IncomePerCapita, y = TotalFertilityRate,
                                     group = MainReligion, color = country),
                                 alpha = 2/3) +
                      geom_text(aes(x = IncomePerCapita, y = TotalFertilityRate,
                                    label = count.int),
                                hjust = -0.1, vjust = 0.5, angle = 45, size = 3) +
                      theme_bw() +
                      ## Fixed axis limits keep the frames comparable
                      scale_x_continuous(limits = c(0, 120000)) +
                      scale_y_continuous(limits = c(0, 10)) +
                      facet_wrap(~ MainReligion, drop = FALSE) +
                      ## ggtitle() + theme() replace opts(), which is no longer
                      ## available in ggplot2; format() turns the Date into text.
                      ggtitle(format(single.year[1, "date"])) +
                      theme(legend.position = "none")

              ## Each printed plot becomes one frame of the animation
              print(gg.graph)
          })
})