Install dependencies
# install.packages('plyr', dependencies = TRUE)
library(plyr)
# install.packages('xtable', dependencies = TRUE)
library(xtable)
Load the data
# Location of the tab-delimited Gapminder five-year data file
gdURL <- "http://www.stat.ubc.ca/~jenny/notOcto/STAT545A/examples/gapminder/data/gapminderDataFiveYear.txt"
# Only the double quote is treated as a quoting character, so an apostrophe
# inside a field is read as ordinary text.
# stringsAsFactors is set explicitly: its default changed to FALSE in R 4.0.0,
# and the str() output shown below (country and continent as factors) relies
# on the character columns being converted to factors.
gDat <- read.table(gdURL, header = TRUE, sep = "\t", quote = "\"",
  stringsAsFactors = TRUE)
Check the data is cleaned and ready to roll
# Show the column names and types of the loaded data frame
str(gDat)
## 'data.frame': 1704 obs. of 6 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ pop : num 8425333 9240934 10267083 11537966 13079460 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ gdpPercap: num 779 821 853 836 740 ...
# Show the last rows of the data frame
tail(gDat)
## country year pop continent lifeExp gdpPercap
## 1699 Zimbabwe 1982 7636524 Africa 60.36 788.9
## 1700 Zimbabwe 1987 9216418 Africa 62.35 706.2
## 1701 Zimbabwe 1992 10704340 Africa 60.38 693.4
## 1702 Zimbabwe 1997 11404948 Africa 46.81 792.4
## 1703 Zimbabwe 2002 11926563 Africa 39.99 672.0
## 1704 Zimbabwe 2007 12311143 Africa 43.49 469.7
Let's look quantitatively at the wealth of nations. We simply break the data set up by continent, and get the max and min GDP per capita and their ratio.
# Per continent: the smallest and largest GDP per capita seen in the data,
# and how many times larger the maximum is than the minimum.
gdpByContinent <- ddply(gDat, ~continent, summarize,
  minGdpPercap = min(gdpPercap),
  maxGdpPercap = max(gdpPercap),
  # columns created earlier in the same summarize call can be reused here
  richVsPoor = maxGdpPercap / minGdpPercap
)
# Order from the smallest to the largest ratio, then build the display table;
# digits = 0 rounds the displayed numbers to whole units.
gdpByContinent <- xtable(arrange(gdpByContinent, richVsPoor), digits = 0)
print(gdpByContinent, type = "html", include.rownames = FALSE)
| continent | minGdpPercap | maxGdpPercap | richVsPoor |
|---|---|---|---|
| Oceania | 10040 | 34435 | 3 |
| Americas | 1202 | 42952 | 36 |
| Europe | 974 | 49357 | 51 |
| Africa | 241 | 21951 | 91 |
| Asia | 331 | 113523 | 343 |
Here I take “wealth distribution” to be the fold difference between the max and min gdpPercap
Yes, it is true, your eyes don't deceive you: the “richest” country is that many times richer than the “poorest” country
Asia has the largest “wealth distribution”, the gap between rich and poor
Let's look at the spread of life expectancy. There are various metrics.
# Number of decimal places kept for each spread statistic
roundDec <- 1
# Three measures of the spread of life expectancy within each continent:
# standard deviation, median absolute deviation and interquartile range.
lifeExpByCont <- ddply(gDat, ~continent, summarize,
  sdLifeExp = round(sd(lifeExp), digits = roundDec),
  madLifeExp = round(mad(lifeExp), digits = roundDec),
  IQRLifeExp = round(IQR(lifeExp), digits = roundDec)
)
# Continents ordered by standard deviation
arrange(lifeExpByCont, sdLifeExp)
## continent sdLifeExp madLifeExp IQRLifeExp
## 1 Oceania 3.8 4.0 6.3
## 2 Europe 5.4 4.4 5.9
## 3 Africa 9.2 8.6 12.0
## 4 Americas 9.3 8.5 13.3
## 5 Asia 11.9 13.0 18.1
# Same table, ordered by the madLifeExp column
arrange(lifeExpByCont, madLifeExp)
## continent sdLifeExp madLifeExp IQRLifeExp
## 1 Oceania 3.8 4.0 6.3
## 2 Europe 5.4 4.4 5.9
## 3 Americas 9.3 8.5 13.3
## 4 Africa 9.2 8.6 12.0
## 5 Asia 11.9 13.0 18.1
# Same table, ordered by the IQRLifeExp column
arrange(lifeExpByCont, IQRLifeExp)
## continent sdLifeExp madLifeExp IQRLifeExp
## 1 Europe 5.4 4.4 5.9
## 2 Oceania 3.8 4.0 6.3
## 3 Africa 9.2 8.6 12.0
## 4 Americas 9.3 8.5 13.3
## 5 Asia 11.9 13.0 18.1
As you can see the results depend on the metric used
Asia always has the highest spread, but sometimes the lowest spread is Europe, sometimes Oceania
Take home lesson - always mention what you mean by “spread”
We compute the average life expectancy for each year over the whole data set
But we remove 5% of the max outliers and 5% of the min outliers, since the mean is sensitive to outliers
# Fraction of observations dropped from EACH end before the mean is taken
trimFrac <- 0.05 # this is about 7 maxs and 7 mins lopped off
# Trimmed mean of life expectancy for every year, across all countries
lifeExpByYear <- ddply(gDat, ~year, summarize,
  avLifeExp = mean(lifeExp, trim = trimFrac))
# The argument is spelled `digits` in full; the previous `digit = 1` only
# worked through partial argument matching.
lifeExpByYear <- xtable(lifeExpByYear, digits = 1)
print(lifeExpByYear, type = "html", include.rownames = FALSE)
| year | avLifeExp |
|---|---|
| 1952 | 48.8 |
| 1957 | 51.4 |
| 1962 | 53.6 |
| 1967 | 55.8 |
| 1972 | 57.9 |
| 1977 | 59.9 |
| 1982 | 61.9 |
| 1987 | 63.6 |
| 1992 | 64.8 |
| 1997 | 65.6 |
| 2002 | 66.2 |
| 2007 | 67.6 |
Imagine not making it to “middle age” (taken to be 40 years)
How many countries are there in each continent, in each year, that have a life expectancy less than 40?
We feed in a subset of the data, with our middle-age cutoff already applied, right at the start
Let's keep the table ordered by continent and year
# Life expectancy (in years) below which a country is counted
middleAge <- 40
# For every continent/year pair, count the distinct countries whose life
# expectancy is under the cutoff. The data is filtered before grouping, so
# pairs with no such country do not appear in the result.
middleAgeCount <- ddply(
  subset(gDat, subset = lifeExp < middleAge),
  ~continent + year,
  summarize,
  countryCount = sum(!duplicated(country))
)
middleAgeCount <- xtable(middleAgeCount)
print(middleAgeCount, type = "html", include.rownames = FALSE)
| continent | year | countryCount |
|---|---|---|
| Africa | 1952 | 29 |
| Africa | 1957 | 23 |
| Africa | 1962 | 15 |
| Africa | 1967 | 10 |
| Africa | 1972 | 6 |
| Africa | 1977 | 3 |
| Africa | 1982 | 3 |
| Africa | 1987 | 1 |
| Africa | 1992 | 3 |
| Africa | 1997 | 2 |
| Africa | 2002 | 2 |
| Africa | 2007 | 1 |
| Americas | 1952 | 1 |
| Asia | 1952 | 10 |
| Asia | 1957 | 5 |
| Asia | 1962 | 3 |
| Asia | 1967 | 2 |
| Asia | 1972 | 2 |
| Asia | 1977 | 2 |
| Asia | 1982 | 1 |
Now let's look at who has the most extreme life expectancy in a given year
We first write a function that gives you the answer to “what country has this life expectancy?”
# Look up which country (or countries) recorded a given life expectancy.
#   lifeExpVal: life expectancy value to match exactly
#   yearVal:    optional year to restrict the search to; NULL (the default)
#               searches every year in gDat
# Returns the matching entries of gDat$country (possibly more than one).
getCountryWithLE <- function(lifeExpVal, yearVal = NULL) {
  isMatch <- gDat$lifeExp == lifeExpVal
  if (!is.null(yearVal)) {
    # Without this restriction, a different country that happened to record
    # the same value in another year could be returned instead.
    isMatch <- isMatch & gDat$year == yearVal
  }
  gDat[which(isMatch), ]$country
}
# Lowest and highest life expectancy in each year and a country holding each.
# The year of the current group is passed to the lookup so the country is
# always taken from that same year. Several countries can tie for an extreme,
# so only the first match is kept; there may be additional countries.
lifeExpByYear <- ddply(gDat, ~year, summarize,
  minLifeExp = min(lifeExp),
  minCountry = getCountryWithLE(minLifeExp, year[1])[1],
  maxLifeExp = max(lifeExp),
  maxCountry = getCountryWithLE(maxLifeExp, year[1])[1]
)
lifeExpByYear <- xtable(lifeExpByYear, digits = 0)
print(lifeExpByYear, type = "html", include.rownames = FALSE)
| year | minLifeExp | minCountry | maxLifeExp | maxCountry |
|---|---|---|---|---|
| 1952 | 29 | Afghanistan | 73 | Norway |
| 1957 | 30 | Afghanistan | 73 | Denmark |
| 1962 | 32 | Afghanistan | 74 | Croatia |
| 1967 | 34 | Afghanistan | 74 | Sweden |
| 1972 | 35 | Sierra Leone | 75 | Sweden |
| 1977 | 31 | Cambodia | 76 | Denmark |
| 1982 | 38 | Sierra Leone | 77 | Japan |
| 1987 | 40 | Angola | 79 | Germany |
| 1992 | 24 | Rwanda | 79 | Japan |
| 1997 | 36 | Rwanda | 81 | Japan |
| 2002 | 39 | Zambia | 82 | Japan |
| 2007 | 40 | Swaziland | 83 | Japan |