STAT 545A Homework 5

Yumian Hu

2013-10-06

Load the Gapminder data, necessary packages and drop Oceania.

In this homework, I continue to work with the Gapminder data to re-plot the figures in homework 4 with ggplot2 packages, and do a comparison between lattice and ggplot2.

> gDat <- read.delim("gapminderDataFiveYear.txt")
> str(gDat)
'data.frame':   1704 obs. of  6 variables:
 $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
 $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
 $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
 $ gdpPercap: num  779 821 853 836 740 ...
> library(ggplot2)
> library(lattice)
> library(plyr)
> library(xtable)
> library(gridExtra)
Loading required package: grid
> library(reshape2)

Drop Oceania since it contains only two contries

> jDat <- droplevels(subset(gDat, continent != "Oceania"))
> str(jDat)
'data.frame':   1680 obs. of  6 variables:
 $ country  : Factor w/ 140 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
 $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
 $ continent: Factor w/ 4 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
 $ gdpPercap: num  779 821 853 836 740 ...

How is life expectancy changing over time on different continents?

Calulate trimmed mean (trim = 0.2) of Life expectancy for each combination of continent & year.

1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 2002 2007
Europe 65 68 69 70 71 72 73 74 75 76 77 78
Americas 53 56 59 61 63 65 67 69 70 72 73 74
Asia 45 49 51 54 58 61 63 66 67 69 70 71
Africa 39 41 43 45 47 49 51 53 54 53 52 53

I will then display the trend of lifeExp over time on different continents, connected by year specific trimmed mean with both lattice and ggplot2.

> jTrim <- 0.2
> myTrimMean <- function(x) mean(x, trim = jTrim)
> latfig <- stripplot(lifeExp ~ factor(year) | continent, jDat, 
+                     jitter.data = TRUE, group=continent, layout=c(2,2),
+                     type = c("p", "a"), fun = myTrimMean, alpha = 0.6, grid = "h",
+                     main = list(label=paste("Life expectancy, trimmed mean, trim = ", jTrim), cex=0.9),
+                     scales = list(x = list(rot = c(45, 0))))
> 
> ggfig <- ggplot(jDat, aes(x = year, y = lifeExp, colour=continent)) + geom_point() + 
+   stat_summary(fun.y = myTrimMean, geom="line") + geom_jitter(position = position_jitter(width = .3)) + 
+   facet_wrap(~ continent) + ggtitle(paste("Life expectancy, trimmed mean, trim = ", jTrim)) + 
+   theme(plot.title = element_text(face="bold"))
> 
> grid.arrange(latfig, ggfig, ncol=2)

plot of chunk unnamed-chunk-7

Or plot different continents on the same panel.

> latfig <- stripplot(lifeExp ~ factor(year), jDat, jitter.data = TRUE,
+                     group = reorder(continent, lifeExp),
+                     type = c("p", "a"), fun = myTrimMean, alpha = 0.6, grid = "h",
+                     main = list(label=paste("Life expectancy, trimmed mean, trim = ", jTrim), cex=0.9),
+                     scales = list(x = list(rot = c(45, 0))),
+                     auto.key = list(reverse.rows = TRUE,
+                                     x = 0.05, y = 0.98, corner = c(0, 1)))
> 
> ggfig <- ggplot(jDat, aes(x = year, y = lifeExp, colour=continent)) + geom_point() + 
+   stat_summary(fun.y = myTrimMean, geom="line") + geom_jitter(position = position_jitter(width = .4)) + 
+   ggtitle(paste("Life expectancy, trimmed mean, trim = ", jTrim)) + 
+   theme(plot.title = element_text(face="bold"))
> 
> grid.arrange(latfig, ggfig, ncol=2)

plot of chunk unnamed-chunk-8

Look at the spread of GDP per capita within the continents.

First if we ignore year, the sd, iqr and mad of GdpPercap for each continent are shown in the table below.

Africa Americas Asia Europe
sdGdpPercap 2828 6397 14045 9355
iqrGdpPercap 1616 4402 7492 13248
madGdpPercap 775 3269 2821 8846

Then use both lattice and ggplot2 to show the measure of spread for each continent.

> foo <- ddply(jDat, ~ continent, summarize,
+              sdGdpPercap = sd(gdpPercap), iqrGdpPercap = IQR(gdpPercap),
+              madGdpPercap = mad(gdpPercap))
> latfig <- xyplot(sdGdpPercap + iqrGdpPercap + madGdpPercap ~ continent, foo,
+                  type = "b", ylab = "measure of spread",
+                  auto.key = list(x = 0.05, y = 0.95, corner = c(0, 1)))
> 
> foonew <- melt(foo, id="continent")
> ggfig <- ggplot(foonew, aes(x = continent, y = value, colour = variable)) + 
+   geom_point() + geom_line(aes(x=as.numeric(continent)))+ ylab("measure of spread") 
> 
> grid.arrange(latfig, ggfig, ncol=2)

plot of chunk unnamed-chunk-10

Then I account for year. Let's put measure of spread in different panels.

> foo <- ddply(jDat, ~ continent + year, summarize,
+              sdGdpPercap = sd(gdpPercap), iqrGdpPercap = IQR(gdpPercap),
+              madGdpPercap = mad(gdpPercap))
> ## Lattice
> xyplot(sdGdpPercap + iqrGdpPercap + madGdpPercap ~ year, foo,
+        group = reorder(continent, sdGdpPercap), layout = c(3, 1),
+        type = "b", ylab = "measure of spread",
+        auto.key = list(x = 0.35, y = 0.85, corner = c(0, 1),
+                        reverse.rows = TRUE))

plot of chunk unnamed-chunk-11

> ## ggplot2
> foonew <- melt(foo, id=c("continent", "year"))
> ggplot(foonew, aes(x = year, y = value, colour = continent)) + 
+   geom_point() + geom_line()+ ylab("measure of spread") + facet_wrap(~ variable, nrow=1)

plot of chunk unnamed-chunk-12

Depict the number and/or proportion of countries with low life expectancy over time by continent.

Low life expectancy is defined as 43 to be consistent with Jenny's setting. The table below showw the percentage of low life expectancy in each continent from year 1952 to 2007.

Africa Americas Asia Europe Oceania
1952 0.83 (43/52) 0.20 (5/25) 0.36 (12/33) 0.00 (0/30) 0.00 (0/2)
1957 0.65 (34/52) 0.08 (2/25) 0.33 (11/33) 0.00 (0/30) 0.00 (0/2)
1962 0.54 (28/52) 0.00 (0/25) 0.15 (5/33) 0.00 (0/30) 0.00 (0/2)
1967 0.38 (20/52) 0.00 (0/25) 0.09 (3/33) 0.00 (0/30) 0.00 (0/2)
1972 0.25 (13/52) 0.00 (0/25) 0.09 (3/33) 0.00 (0/30) 0.00 (0/2)
1977 0.19 (10/52) 0.00 (0/25) 0.06 (2/33) 0.00 (0/30) 0.00 (0/2)
1982 0.13 (7/52) 0.00 (0/25) 0.03 (1/33) 0.00 (0/30) 0.00 (0/2)
1987 0.08 (4/52) 0.00 (0/25) 0.03 (1/33) 0.00 (0/30) 0.00 (0/2)
1992 0.10 (5/52) 0.00 (0/25) 0.03 (1/33) 0.00 (0/30) 0.00 (0/2)
1997 0.12 (6/52) 0.00 (0/25) 0.03 (1/33) 0.00 (0/30) 0.00 (0/2)
2002 0.08 (4/52) 0.00 (0/25) 0.03 (1/33) 0.00 (0/30) 0.00 (0/2)
2007 0.12 (6/52) 0.00 (0/25) 0.00 (0/33) 0.00 (0/30) 0.00 (0/2)

Use both lattice and ggplot2 to show the results. LifeExp values are plotted with two different colours to distinguish between low life expectancy and normal ones.

> bMark <- 43 
> ## Lattice
> stripplot(lifeExp ~ factor(year) | reorder(continent, lifeExp), jDat,
+                     jitter.data = TRUE, alpha = 0.6, grid = "h",
+                     group = lifeExp <= bMark, layout=c(2,2),
+                     main = list(label=paste("Life expectancy <= ", bMark), cex=0.9),
+                     scales = list(x = list(rot = c(45, 0))))

plot of chunk unnamed-chunk-14

> ## ggplot2
> ggplot(jDat, aes(x = year, y = lifeExp, colour = lifeExp <= bMark)) + 
+   geom_jitter(position = position_jitter(width = .2)) + facet_wrap(~ continent) + 
+   ggtitle(paste("Life expectancy <= ", bMark)) + theme(plot.title = element_text(face="bold")) + 
+   scale_colour_discrete(name="",
+                         breaks=c("FALSE", "TRUE"),
+                         labels=c("LifeExp > 43", "LifeExp <= 43"))

plot of chunk unnamed-chunk-15

Play with Gapminder bubble chart using ggplot2.

In this part, I will try to re-produce the Gapminder bubble chart from last tutorial using ggplot2 package.

> gdURL <- "http://www.stat.ubc.ca/~jenny/notOcto/STAT545A/examples/gapminder/data/gapminderWithColorsAndSorted.txt"
> kDat <- read.delim(file = gdURL, as.is = 7) 
> str(kDat)
'data.frame':   1704 obs. of  7 variables:
 $ country  : Factor w/ 142 levels "Afghanistan",..: 25 59 135 67 60 48 15 134 65 9 ...
 $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 2 3 3 4 2 4 4 3 ...
 $ year     : int  1952 1952 1952 1952 1952 1952 1952 1952 1952 1952 ...
 $ pop      : num  5.56e+08 3.72e+08 1.58e+08 8.65e+07 8.21e+07 ...
 $ lifeExp  : num  44 37.4 68.4 63 37.5 ...
 $ gdpPercap: num  400 547 13990 3217 750 ...
 $ color    : chr  "#40004B" "#460552" "#A50026" "#611A6D" ...
> jYear <- c(1952, 2007)
> yDat <- subset(kDat, year %in% jYear)
> str(yDat)
'data.frame':   284 obs. of  7 variables:
 $ country  : Factor w/ 142 levels "Afghanistan",..: 25 59 135 67 60 48 15 134 65 9 ...
 $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 2 3 3 4 2 4 4 3 ...
 $ year     : int  1952 1952 1952 1952 1952 1952 1952 1952 1952 1952 ...
 $ pop      : num  5.56e+08 3.72e+08 1.58e+08 8.65e+07 8.21e+07 ...
 $ lifeExp  : num  44 37.4 68.4 63 37.5 ...
 $ gdpPercap: num  400 547 13990 3217 750 ...
 $ color    : chr  "#40004B" "#460552" "#A50026" "#611A6D" ...

Why not first review Jenny's lattice code and figure?

> jCexDivisor <- 1500                     
> jPch <- 21
> jDarkGray <- 'grey20'
> gdURL <- "http://www.stat.ubc.ca/~jenny/notOcto/STAT545A/examples/gapminder/data/gapminderContinentColors.txt"
> continentColors <- read.delim(file = gdURL, as.is = 3) 
> continentKey <-
+   with(continentColors,
+        list(x = 0.95, y = 0.05, corner = c(1, 0),
+             text = list(as.character(continent)),
+             points = list(pch = jPch, col = jDarkGray, fill = color)))
> xyplot(lifeExp ~ gdpPercap | factor(year), yDat, aspect = 2/3,
+        grid = TRUE, scales = list(x = list(log = 10, equispaced.log = FALSE)),
+        cex = sqrt(yDat$pop/pi)/jCexDivisor, fill.color = yDat$color,
+        col = jDarkGray, key = continentKey,
+        panel = function(x, y, ..., cex, fill.color, subscripts) {
+          panel.xyplot(x, y, cex = cex[subscripts],
+                       pch = jPch, fill = fill.color[subscripts], ...)
+        })

plot of chunk unnamed-chunk-17

Below is my try with ggplot2.

> cols <- yDat$color
> names(cols) <- unlist(yDat$color)
> ggplot(yDat, aes(x=gdpPercap, y=lifeExp)) + scale_x_log10() + 
+   geom_point(aes(size = sqrt(pop/pi)/jCexDivisor, fill=color), 
+              color= jDarkGray, pch = jPch, show_guide = FALSE) +
+   scale_size_continuous(range=c(1,40)) +
+   scale_fill_manual(values = cols) + 
+   facet_wrap(~year, nrow=2)

plot of chunk unnamed-chunk-18

Compare lattice and ggplot2 codes for the Gapminder bubble chart