Getting data from GapMinder.org Revised

## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
## Chunk defaults: no prefix on printed output, warnings and messages hidden,
## source echoed, code left unformatted (tidy = FALSE), figures 7 x 7.
## opts_chunk is namespace-qualified so the call also works when knitr is
## loaded but not attached to the search path.
knitr::opts_chunk$set(comment = "", warning = FALSE, message = FALSE, echo = TRUE,
    tidy = FALSE, fig.width = 7, fig.height = 7)
## Wider printed output; scipen = 10 biases number printing towards fixed
## (non-scientific) notation
options(width = 116, scipen = 10)

References

Use gapminder() function

## Load datamart package
## (presumably the source of gapminder(), query() and queries() used in this
## document -- no other package is attached before they are called)
library(datamart)

## Establish connection
## gm is the object handed to every later query()/queries() call
gm <- gapminder()

## Define a wrapper function for typical datasets
##
## Queries one dataset and reshapes it from wide (one column per country)
## to long format.
##
## Args:
##   gm:        object passed as the first argument to query(); in this
##              document it is the result of gapminder().
##   data.name: dataset name passed to query(); also used as the name of the
##              value column in the result.
##
## Returns:
##   A long data frame with the columns `date`, `country` and one value
##   column named after data.name.
##
## Assumes the queried object converts to a data frame whose row names are
## dates that as.Date() can parse -- TODO confirm against datamart.
gapminder2df <- function(gm, data.name) {
    ## Fail fast with a clear message; require() would only return FALSE and
    ## let the function die later with "could not find function melt".
    if (!requireNamespace("reshape2", quietly = TRUE)) {
        stop("Package 'reshape2' is required by gapminder2df()", call. = FALSE)
    }

    data.object           <- query(gm, data.name)
    df.data.object        <- data.frame(data.object)
    ## data.frame() sanitises column names by default (check.names = TRUE);
    ## put the original names back
    names(df.data.object) <- names(data.object)
    ## Row names become an explicit Date column so melt() can use it as id
    df.data.object$date   <- as.Date(rownames(df.data.object))

    ## id.vars is spelled out in full rather than relying on partial matching
    reshape2::melt(data          = df.data.object,
                   id.vars       = "date",
                   variable.name = "country",
                   value.name    = as.character(data.name))
}

## Query for data available
## (the printed names include the dataset names that are passed to query()
## later in this document, e.g. "TotalFertilityRate" and "MainReligion")
queries(gm)
 [1] "Population"                         "MainReligion"                       "TotalFertilityRate"                
 [4] "PerCapitaCO2Emissions"              "IncomePerCapita"                    "InfantMortalityRate"               
 [7] "LifeExpectancyAtBirth"              "AdolescentFertilityRate"            "BirthsAttendedBySkilledHealthStaff"
[10] "ContraceptiveUse"                   "CrudeBirthRate"                     "MaternalMortalityRate"             
[13] "Under5MortalityRate"                "CrudeDeathRate"                     "PopulationGrowth"                  
[16] "SugarConsumption"                   "GDP"                                "ConsumerPricesIndex"               
[19] "GDPImplicitDeflator"                "CoalConsumption"                    "HydroelectricityConsumption"       
[22] "NaturalGasConsumption"              "NuclearConsumption"                 "OilConsumption"                    
[25] "CoalProduction"                     "ElectricityGeneration"              "NaturalGasProduction"              
[28] "OilProduction"                      "PrimaryEnergyConsumption"           "CO2Emissions"                      
[31] "SulfurEmissions"                    "TotalForestArea"                    "PrimaryForestArea"                 
[34] "PlantedForestArea"                  "WoodRemoval"                        "BiomassStockInForest"              
[37] "TotalWaterWithdrawal"               "SurfaceArea"                        "BadTeethPerChild"                  
[40] "PeopleLivingWithHIV"                "MalariaReportedCases"               "MalariaReportedDeaths"             
[43] "WorkingHoursPerWeek"                "UrbanPopulation"                    "WomensAgeAtFirstMarriage"          
[46] "NumberOfBillionaires"               "GiniIndex"                          "BroadbandSubscribers"              
[49] "CellPhones"                         "PersonalComputers"                  "PatentApplications"                
[52] "PatentsGranted"                     "PatentsInForce"                     "ArmsExports"                       
[55] "ArmsImports"                        "HumanDevelopmentIndex"             

Obtain data

## These share a common structure: time ~ country (values in cells)
## Each call returns a long data frame: date, country, <dataset name>
TotalFertilityRate <- gapminder2df(gm, "TotalFertilityRate")
IncomePerCapita    <- gapminder2df(gm, "IncomePerCapita")
Population         <- gapminder2df(gm, "Population")

## MainReligion has only two columns: Entity and Group
MainReligion        <- data.frame(query(gm, "MainReligion"))
names(MainReligion) <- c("country", "MainReligion")
## Recode the empty group as "other". Work on a character vector first:
## assigning a new value into a factor that lacks that level yields NA, and
## `== ""` would put NA into the index if the column contains missing values.
## `%in%` never returns NA, so missing groups are simply left as NA.
MainReligion$MainReligion <- as.character(MainReligion$MainReligion)
MainReligion$MainReligion[MainReligion$MainReligion %in% ""] <- "other"
MainReligion$MainReligion <- factor(MainReligion$MainReligion)

Use Reduce for merging multiple data frames

## Reference
## http://stackoverflow.com/questions/8091303/merge-multiple-data-frames-in-a-list-simultaneously
## Reduce() folds from the left: the first two data frames are merged, that
## result is merged with the third, and so on until the list is exhausted.
list.of.data.frames <- list(
    TotalFertilityRate = TotalFertilityRate,
    IncomePerCapita    = IncomePerCapita,
    Population         = Population
)

## Full outer join at every step: all = TRUE keeps unmatched rows of both sides
df.merged <- Reduce(f = function(x, y) merge(x, y, all = TRUE),
                    x = list.of.data.frames)

## Left join: df.merged <- MainReligion (all rows of df.merged are kept)
df.merged <- merge(df.merged, MainReligion, all.x = TRUE, all.y = FALSE)

## Use only complete cases (inner join is another option)
df.merged.complete <- df.merged[complete.cases(df.merged), ]

Mark countries of interest

## Countries that get a text label in the plots
countries.of.interestn <- c(
    "Japan", "United Kingdom", "USSR", "Russia", "China", "United States",
    "India", "Iran", "Saudi Arabia", "Qatar", "Korea, Rep.",
    "Korea, Dem. Rep.", "Brunei", "Kuwait", "Norway", "Luxembourg",
    "United Arab Emirates", "Korea, United", "Iraq", "Sweden"
)

## count.int holds the country name for the countries listed above and ""
## for every other row. ifelse() evaluates the membership test once and,
## unlike assigning a scalar column first, tolerates a zero-row data frame.
df.merged.complete$count.int <-
    ifelse(df.merged.complete$country %in% countries.of.interestn,
           as.character(df.merged.complete$country),
           "")

Plotting all years with ggplot2:

The total fertility rate (babies born per woman) is lower for high-income data points, except for some data points in the Muslim group.

## ggplot2
library(ggplot2)

## All country-years in one figure: income per capita on x, total fertility
## rate on y, point size by population, colour by country, one panel per
## main religion (drop = FALSE keeps panels for unused factor levels).
ggplot(df.merged.complete) +
    geom_point(aes(x = IncomePerCapita, y = TotalFertilityRate,
                   group = MainReligion, color = country, size = Population)) +
    theme_bw() +
    ## `limits` is spelled out in full rather than relying on partial matching
    scale_x_continuous(limits = c(0, 120000)) +
    scale_y_continuous(limits = c(0, 10)) +
    facet_wrap(~ MainReligion, drop = FALSE) +
    ## theme() replaces opts(), which is no longer available in ggplot2;
    ## the legend is hidden because there is one colour per country
    theme(legend.position = "none")

plot of chunk unnamed-chunk-6

Plotting year 2008 with ggplot2

## Keep only the rows dated 2008-01-01
data.2008 <- subset(df.merged.complete, date == as.Date("2008-01-01"))

## Black and white theme, one panel per MainReligion via facet_wrap().
## Points are drawn with alpha = 1/3 so overlapping points stay visible;
## count.int supplies the text labels, rotated by 45 degrees.
ggplot(data.2008, aes(x = IncomePerCapita, y = TotalFertilityRate)) +
    geom_point(aes(group = MainReligion), alpha = 1/3) +
    geom_text(aes(label = count.int),
              hjust = -0.1, vjust = 0.5, angle = 45, size = 3) +
    theme_bw() +
    facet_wrap(~ MainReligion)

plot of chunk unnamed-chunk-7

Plotting every ten years from 1901 to 2011

In most cases, the total fertility rate (babies born per woman) decreased before these countries became rich, except for some Muslim countries in 1981.

## 1 January of every tenth year, 1901 through 2011
every.x.years <- as.Date(paste0(seq(1901, 2011, by = 10), "-01-01"))

## Use plyr::d_ply() for looping
## d_ply() splits the data by date and calls the function once per piece,
## discarding return values; it is used here only for the plotting side effect.
library(plyr)
d_ply(.data = subset(df.merged.complete, date %in% every.x.years),
      .variables = "date",
      .fun = function(single.year) {

          gg.graph <-
              ggplot(single.year) +
                  geom_point(aes(x = IncomePerCapita, y = TotalFertilityRate,
                                 group = MainReligion, color = country),
                             alpha = 2/3) +
                  geom_text(aes(x = IncomePerCapita, y = TotalFertilityRate,
                                label = count.int),
                            hjust = -0.1, vjust = 0.5, angle = 45, size = 3) +
                  theme_bw() +
                  ## Fixed axis limits keep the yearly plots comparable
                  scale_x_continuous(limits = c(0, 120000)) +
                  scale_y_continuous(limits = c(0, 10)) +
                  facet_wrap(~ MainReligion, drop = FALSE) +
                  ## ggtitle() + theme() replace opts(), which is no longer
                  ## available in ggplot2; the Date is converted to text
                  ## explicitly so the title reads e.g. "1901-01-01"
                  ggtitle(as.character(single.year[1, "date"])) +
                  theme(legend.position = "none")

          ## ggplot objects are not auto-printed inside a function
          print(gg.graph)
      })

plot of chunk unnamed-chunk-8 plot of chunk unnamed-chunk-8 plot of chunk unnamed-chunk-8 plot of chunk unnamed-chunk-8 plot of chunk unnamed-chunk-8 plot of chunk unnamed-chunk-8 plot of chunk unnamed-chunk-8 plot of chunk unnamed-chunk-8 plot of chunk unnamed-chunk-8 plot of chunk unnamed-chunk-8 plot of chunk unnamed-chunk-8 plot of chunk unnamed-chunk-8

You can animate it using the animation package. (Not possible on RPubs)

library(animation)

## Every plot printed between ani.start() and ani.stop() becomes one frame.
## NOTE(review): newer releases of the animation package reportedly favour
## saveHTML() over ani.start()/ani.stop() -- confirm against the installed
## version before relying on this.
ani.start()

## One frame per date present in the data (d_ply() comes from plyr)
d_ply(.data = df.merged.complete,
      .variables = "date",
      .fun = function(single.year) {

          gg.graph <-
              ggplot(single.year) +
                  geom_point(aes(x = IncomePerCapita, y = TotalFertilityRate,
                                 group = MainReligion, color = country),
                             alpha = 2/3) +
                  geom_text(aes(x = IncomePerCapita, y = TotalFertilityRate,
                                label = count.int),
                            hjust = -0.1, vjust = 0.5, angle = 45, size = 3) +
                  theme_bw() +
                  ## Fixed axis limits keep the frames comparable
                  scale_x_continuous(limits = c(0, 120000)) +
                  scale_y_continuous(limits = c(0, 10)) +
                  facet_wrap(~ MainReligion, drop = FALSE) +
                  ## ggtitle() + theme() replace opts(), which is no longer
                  ## available in ggplot2; the Date is converted to text
                  ## explicitly so the title reads e.g. "1901-01-01"
                  ggtitle(as.character(single.year[1, "date"])) +
                  theme(legend.position = "none")

          ## ggplot objects are not auto-printed inside a function
          print(gg.graph)
      })

ani.stop()