STAT545a-2013-hw3_Liu
Read data into R and install the packages we need.
getwd()
## [1] "C:/Users/Yan/Dropbox/Important_File/2013-2014Courses/545_R/exercise"
# No setwd() call here: the data file below is read relative to the current
# working directory, so the script is not tied to one machine's absolute path.
# Run it from the directory that contains gapminderDataFiveYear.txt.
#
# One-time package installation (uncomment if a package is missing):
# install.packages("plyr", dependencies = TRUE)
# install.packages("xtable", dependencies = TRUE)
# install.packages("reshape")
library(plyr)     # ddply() / daply() / arrange()
library(lattice)  # xyplot()
library(xtable)   # xtable() and its HTML print method
# Tab-delimited Gapminder extract; str() gives a quick structural check.
gDat <- read.delim("gapminderDataFiveYear.txt")
str(gDat)
## 'data.frame': 1704 obs. of 6 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ pop : num 8425333 9240934 10267083 11537966 13079460 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ gdpPercap: num 779 821 853 836 740 ...
# Smallest and largest GDP per capita observed on each continent.
CapByCont <- ddply(gDat, .(continent), summarise,
  mingdpPercap = min(gdpPercap),
  maxgdpPercap = max(gdpPercap)
)
CapByCont
## continent mingdpPercap maxgdpPercap
## 1 Africa 241.2 21951
## 2 Americas 1201.6 42952
## 3 Asia 331.0 113523
## 4 Europe 973.5 49357
## 5 Oceania 10039.6 34435
# Render `x` as an HTML table via xtable.
#   x                 object accepted by xtable()
#   ...               extra arguments, forwarded to BOTH xtable() and print()
#   digits            number of digits passed to xtable() (default 0)
#   include.rownames  whether print() should emit row names (default FALSE)
htmlPrint <- function(x, ..., digits = 0, include.rownames = FALSE) {
  htmlTable <- xtable(x, digits = digits, ...)
  print(htmlTable, type = "html", include.rownames = include.rownames, ...)
}
# Continent-level GDP-per-capita range, printed as an HTML table ordered by
# the minimum.
foo <- ddply(gDat, .(continent), summarise,
  minGdpPercap = min(gdpPercap),
  maxGdpPercap = max(gdpPercap)
)
htmlPrint(arrange(foo, minGdpPercap))
| continent | minGdpPercap | maxGdpPercap |
|---|---|---|
| Africa | 241 | 21951 |
| Asia | 331 | 113523 |
| Europe | 974 | 49357 |
| Americas | 1202 | 42952 |
| Oceania | 10040 | 34435 |
library(reshape)
## Attaching package: 'reshape'
##
## The following object is masked from 'package:plyr':
##
## rename, round_any
# Min/max GDP per capita by continent, kept in wide form for melt() below.
gdpByCont <- ddply(
  gDat, "continent", summarize,
  mingdpPercap = min(gdpPercap),
  maxgdpPercap = max(gdpPercap)
)
gdpByCont
## continent mingdpPercap maxgdpPercap
## 1 Africa 241.2 21951
## 2 Americas 1201.6 42952
## 3 Asia 331.0 113523
## 4 Europe 973.5 49357
## 5 Oceania 10039.6 34435
# Reshape the wide min/max table to long ("tall") form: one row per
# continent/statistic pair. The id column is named explicitly instead of
# relying on partial matching of `id` to `id.vars` and on column position.
melted <- melt(gdpByCont, id.vars = "continent")
melted
## continent variable value
## 1 Africa mingdpPercap 241.2
## 2 Americas mingdpPercap 1201.6
## 3 Asia mingdpPercap 331.0
## 4 Europe mingdpPercap 973.5
## 5 Oceania mingdpPercap 10039.6
## 6 Africa maxgdpPercap 21951.2
## 7 Americas maxgdpPercap 42951.7
## 8 Asia maxgdpPercap 113523.1
## 9 Europe maxgdpPercap 49357.2
## 10 Oceania maxgdpPercap 34435.4
# Tall layout built directly: two rows per continent, one for the minimum and
# one for the maximum GDP per capita (range() returns them in that order).
foo <- ddply(gDat, .(continent), function(contDat) {
  data.frame(gdpPercap = range(contDat$gdpPercap), stat = c("min", "max"))
})
htmlPrint(foo)
| continent | gdpPercap | stat |
|---|---|---|
| Africa | 241 | min |
| Africa | 21951 | max |
| Americas | 1202 | min |
| Americas | 42952 | max |
| Asia | 331 | min |
| Asia | 113523 | max |
| Europe | 974 | min |
| Europe | 49357 | max |
| Oceania | 10040 | min |
| Oceania | 34435 | max |
# Spread of life expectancy within each continent, each rounded to 2 decimals:
#   sd  - standard deviation
#   mad - median absolute deviation (previously mislabelled "median")
#   iqr - interquartile range (previously "Quantile", and wrapped in a second
#         round() with a bogus `IQR =` argument name that discarded the
#         2-decimal rounding)
VarCapByCont <- ddply(gDat, ~continent, summarize,
  sd = round(sd(lifeExp), 2),
  mad = round(mad(lifeExp), 2),
  iqr = round(IQR(lifeExp), 2)
)
VarCapByCont
## continent sd median Quantile
## 1 Africa 9.15 8.58 12
## 2 Americas 9.35 8.47 13
## 3 Asia 11.86 13.00 18
## 4 Europe 5.43 4.43 6
## 5 Oceania 3.80 4.00 6
# Three measures of spread for GDP per capita by continent, shown ordered by
# standard deviation.
foo <- ddply(gDat, .(continent), summarise,
  sdGdpPercap = sd(gdpPercap),
  madGdpPercap = mad(gdpPercap),
  iqrGdpPercap = IQR(gdpPercap)
)
htmlPrint(arrange(foo, sdGdpPercap))
| continent | sdGdpPercap | madGdpPercap | iqrGdpPercap |
|---|---|---|---|
| Africa | 2828 | 775 | 1616 |
| Oceania | 6359 | 6459 | 8072 |
| Americas | 6397 | 3269 | 4402 |
| Europe | 9355 | 8846 | 13248 |
| Asia | 14045 | 2821 | 7492 |
# spot check.
IQR(gDat$gdpPercap[gDat$continent == "Europe"])
[1] 13248
4. Descriptive statistics: the trimmed mean of lifeExp for each continent over the years. The first table is in “tall” format and the second is in “wide” format.
# 5%-trimmed mean of life expectancy for every continent/year pair, rounded to
# two decimals ("tall" layout: one row per pair).
MeanCapByCont <- ddply(gDat, ~continent + year, summarise,
  trimean = round(mean(lifeExp, trim = 0.05), 2)
)
MeanCapByCont
## continent year trimean
## 1 Africa 1952 38.99
## 2 Africa 1957 41.02
## 3 Africa 1962 43.08
## 4 Africa 1967 45.12
## 5 Africa 1972 47.26
## 6 Africa 1977 49.42
## 7 Africa 1982 51.43
## 8 Africa 1987 53.20
## 9 Africa 1992 53.82
## 10 Africa 1997 53.42
## 11 Africa 2002 53.02
## 12 Africa 2007 54.54
## 13 Americas 1952 53.29
## 14 Americas 1957 56.02
## 15 Americas 1962 58.49
## 16 Americas 1967 60.57
## 17 Americas 1972 62.62
## 18 Americas 1977 64.59
## 19 Americas 1982 66.46
## 20 Americas 1987 68.34
## 21 Americas 1992 69.83
## 22 Americas 1997 71.46
## 23 Americas 2002 72.72
## 24 Americas 2007 73.85
## 25 Asia 1952 46.26
## 26 Asia 1957 49.33
## 27 Asia 1962 51.62
## 28 Asia 1967 54.79
## 29 Asia 1972 57.48
## 30 Asia 1977 60.02
## 31 Asia 1982 62.88
## 32 Asia 1987 65.18
## 33 Asia 1992 66.93
## 34 Asia 1997 68.46
## 35 Asia 2002 69.70
## 36 Asia 2007 71.21
## 37 Europe 1952 64.86
## 38 Europe 1957 67.13
## 39 Europe 1962 68.94
## 40 Europe 1967 70.13
## 41 Europe 1972 71.13
## 42 Europe 1977 72.23
## 43 Europe 1982 73.08
## 44 Europe 1987 73.88
## 45 Europe 1992 74.58
## 46 Europe 1997 75.60
## 47 Europe 2002 76.77
## 48 Europe 2007 77.71
## 49 Oceania 1952 69.25
## 50 Oceania 1957 70.30
## 51 Oceania 1962 71.09
## 52 Oceania 1967 71.31
## 53 Oceania 1972 71.91
## 54 Oceania 1977 72.85
## 55 Oceania 1982 74.29
## 56 Oceania 1987 75.32
## 57 Oceania 1992 76.94
## 58 Oceania 1997 78.19
## 59 Oceania 2002 79.74
## 60 Oceania 2007 80.72
# Pivot the tall continent/year table to wide form (one column per year).
# The value column is named explicitly so cast() does not have to guess it
# (guessing emits a "Using trimean as value column" message).
casted <- cast(MeanCapByCont, continent ~ year,
  fun.aggregate = mean, value = "trimean"
)
casted
## continent 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997
## 1 Africa 38.99 41.02 43.08 45.12 47.26 49.42 51.43 53.20 53.82 53.42
## 2 Americas 53.29 56.02 58.49 60.57 62.62 64.59 66.46 68.34 69.83 71.46
## 3 Asia 46.26 49.33 51.62 54.79 57.48 60.02 62.88 65.18 66.93 68.46
## 4 Europe 64.86 67.13 68.94 70.13 71.13 72.23 73.08 73.88 74.58 75.60
## 5 Oceania 69.25 70.30 71.09 71.31 71.91 72.85 74.29 75.32 76.94 78.19
## 2002 2007
## 1 53.02 54.54
## 2 72.72 73.85
## 3 69.70 71.21
## 4 76.77 77.71
## 5 79.74 80.72
# Trimmed mean of life expectancy per year, using the trim fraction jTrim.
jTrim <- 0.2
foo <- ddply(gDat, .(year), summarise, tMean = mean(lifeExp, trim = jTrim))
htmlPrint(arrange(foo, tMean))
| year | tMean |
|---|---|
| 1952 | 48 |
| 1957 | 51 |
| 1962 | 53 |
| 1967 | 56 |
| 1972 | 58 |
| 1977 | 60 |
| 1982 | 62 |
| 1987 | 64 |
| 1992 | 66 |
| 1997 | 67 |
| 2002 | 68 |
| 2007 | 69 |
5. Continued descriptive statistics for the trimmed mean of lifeExp for each continent over the years: using daply to obtain a “wide” table.
# Wide continent x year table of the 5%-trimmed mean life expectancy.
# The per-cell function returns a single number rather than the one-row data
# frame that summarize() produces, so that daply() can assemble a plain
# numeric matrix instead of an array of list-valued cells.
MeanCapByCont <- daply(gDat, .(continent, year), function(x) {
  round(mean(x$lifeExp, trim = 0.05), 2)
})
MeanCapByCont
## year
## continent 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997
## Africa 38.99 41.02 43.08 45.12 47.26 49.42 51.43 53.2 53.82 53.42
## Americas 53.29 56.02 58.49 60.57 62.62 64.59 66.46 68.34 69.83 71.46
## Asia 46.26 49.33 51.62 54.79 57.48 60.02 62.88 65.18 66.93 68.46
## Europe 64.86 67.13 68.94 70.13 71.13 72.23 73.08 73.88 74.58 75.6
## Oceania 69.25 70.3 71.09 71.31 71.91 72.85 74.29 75.32 76.94 78.19
## year
## continent 2002 2007
## Africa 53.02 54.54
## Americas 72.72 73.85
## Asia 69.7 71.21
## Europe 76.77 77.71
## Oceania 79.74 80.72
htmlPrint(ddply(gDat, ~continent + year, summarize, medLifeExp = median(lifeExp)))
| continent | year | medLifeExp |
|---|---|---|
| Africa | 1952 | 39 |
| Africa | 1957 | 41 |
| Africa | 1962 | 43 |
| Africa | 1967 | 45 |
| Africa | 1972 | 47 |
| Africa | 1977 | 49 |
| Africa | 1982 | 51 |
| Africa | 1987 | 52 |
| Africa | 1992 | 52 |
| Africa | 1997 | 53 |
| Africa | 2002 | 51 |
| Africa | 2007 | 53 |
| Americas | 1952 | 55 |
| Americas | 1957 | 56 |
| Americas | 1962 | 58 |
| Americas | 1967 | 61 |
| Americas | 1972 | 63 |
| Americas | 1977 | 66 |
| Americas | 1982 | 67 |
| Americas | 1987 | 69 |
| Americas | 1992 | 70 |
| Americas | 1997 | 72 |
| Americas | 2002 | 72 |
| Americas | 2007 | 73 |
| Asia | 1952 | 45 |
| Asia | 1957 | 48 |
| Asia | 1962 | 49 |
| Asia | 1967 | 54 |
| Asia | 1972 | 57 |
| Asia | 1977 | 61 |
| Asia | 1982 | 64 |
| Asia | 1987 | 66 |
| Asia | 1992 | 69 |
| Asia | 1997 | 70 |
| Asia | 2002 | 71 |
| Asia | 2007 | 72 |
| Europe | 1952 | 66 |
| Europe | 1957 | 68 |
| Europe | 1962 | 70 |
| Europe | 1967 | 71 |
| Europe | 1972 | 71 |
| Europe | 1977 | 72 |
| Europe | 1982 | 73 |
| Europe | 1987 | 75 |
| Europe | 1992 | 75 |
| Europe | 1997 | 76 |
| Europe | 2002 | 78 |
| Europe | 2007 | 79 |
| Oceania | 1952 | 69 |
| Oceania | 1957 | 70 |
| Oceania | 1962 | 71 |
| Oceania | 1967 | 71 |
| Oceania | 1972 | 72 |
| Oceania | 1977 | 73 |
| Oceania | 1982 | 74 |
| Oceania | 1987 | 75 |
| Oceania | 1992 | 77 |
| Oceania | 1997 | 78 |
| Oceania | 2002 | 80 |
| Oceania | 2007 | 81 |
# Median life expectancy laid out as a year x continent array; peek at the
# first rows.
foo <- daply(gDat, .(year, continent), summarise,
  medLifeExp = median(lifeExp)
)
head(foo)
continent
year Africa Americas Asia Europe Oceania
1952 38.83 54.74 44.87 65.9 69.25
1957 40.59 56.07 48.28 67.65 70.3
1962 42.63 58.3 49.33 69.53 71.09
1967 44.7 60.52 53.66 70.61 71.31
1972 47.03 63.44 56.95 70.89 71.91
1977 49.27 66.35 60.77 72.34 72.85
# Flatten the daply() array into a data frame so it can be printed as an HTML
# table with the years as row names.
# Known wart (original author's note): str(foo) shows that the resulting
# variables are lists rather than plain vectors.
foo <- as.data.frame(foo)
htmlPrint(foo, include.rownames = TRUE)
| Africa | Americas | Asia | Europe | Oceania | |
|---|---|---|---|---|---|
| 1952 | 39 | 55 | 45 | 66 | 69 |
| 1957 | 41 | 56 | 48 | 68 | 70 |
| 1962 | 43 | 58 | 49 | 70 | 71 |
| 1967 | 45 | 61 | 54 | 71 | 71 |
| 1972 | 47 | 63 | 57 | 71 | 72 |
| 1977 | 49 | 66 | 61 | 72 | 73 |
| 1982 | 51 | 67 | 64 | 73 | 74 |
| 1987 | 52 | 69 | 66 | 75 | 75 |
| 1992 | 52 | 70 | 69 | 75 | 77 |
| 1997 | 53 | 72 | 70 | 76 | 78 |
| 2002 | 51 | 72 | 71 | 78 | 80 |
| 2007 | 53 | 73 | 72 | 79 | 81 |
# Same medians with the roles switched: continents as rows, years as columns.
# The per-cell function returns a bare number (not summarize()'s one-row data
# frame) so that the array, and the data frame built from it, hold numeric
# values rather than list-valued cells.
foo <- daply(gDat, ~continent + year, function(x) median(x$lifeExp))
htmlPrint(as.data.frame(foo), include.rownames = TRUE)
| 1952 | 1957 | 1962 | 1967 | 1972 | 1977 | 1982 | 1987 | 1992 | 1997 | 2002 | 2007 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Africa | 39 | 41 | 43 | 45 | 47 | 49 | 51 | 52 | 52 | 53 | 51 | 53 |
| Americas | 55 | 56 | 58 | 61 | 63 | 66 | 67 | 69 | 70 | 72 | 72 | 73 |
| Asia | 45 | 48 | 49 | 54 | 57 | 61 | 64 | 66 | 69 | 70 | 71 | 72 |
| Europe | 66 | 68 | 70 | 71 | 71 | 72 | 73 | 75 | 75 | 76 | 78 | 79 |
| Oceania | 69 | 70 | 71 | 71 | 72 | 73 | 74 | 75 | 77 | 78 | 80 | 81 |
6. Count the number of countries with low life expectancy over time, by continent. “Tall” format.
# Flag observations whose life expectancy is below the overall median:
# lowlifeExp is 1 for those rows and 0 otherwise.
mdn <- median(gDat$lifeExp)
lowlifeExp <- numeric(nrow(gDat))
gDat <- cbind(gDat, lowlifeExp)
mdn
## [1] 60.71
belowMedian <- which(gDat$lifeExp < mdn)
gDat$lowlifeExp[belowMedian] <- 1
# Tabulate the flag to see how many rows fall on each side.
table(gDat$lowlifeExp)
##
## 0 1
## 852 852
# Number of flagged (low life expectancy) countries per continent and year.
countCapByCont <- daply(gDat, ~continent + year, summarise,
  lowlifefreq = sum(lowlifeExp == 1, na.rm = TRUE)
)
countCapByCont
## year
## continent 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 2002 2007
## Africa 52 52 52 51 50 50 46 41 40 44 41 41
## Americas 19 16 13 13 10 7 5 2 2 1 1 0
## Asia 30 27 26 25 20 16 12 10 8 7 5 3
## Europe 7 3 1 1 1 1 0 0 0 0 0 0
## Oceania 0 0 0 0 0 0 0 0 0 0 0 0
(bMark <- quantile(gDat$lifeExp, 0.2))
20% 45.9
# bMark <- 43 # JB wrote this on her 43rd b'day!
# Count, per continent and year, the countries whose life expectancy is at or
# below the benchmark bMark.
htmlPrint(
  ddply(gDat, .(continent, year), function(cell) {
    c(lowLifeExp = sum(cell$lifeExp <= bMark))
  })
)
| continent | year | lowLifeExp |
|---|---|---|
| Africa | 1952 | 47 |
| Africa | 1957 | 45 |
| Africa | 1962 | 35 |
| Africa | 1967 | 29 |
| Africa | 1972 | 24 |
| Africa | 1977 | 16 |
| Africa | 1982 | 15 |
| Africa | 1987 | 9 |
| Africa | 1992 | 9 |
| Africa | 1997 | 10 |
| Africa | 2002 | 13 |
| Africa | 2007 | 9 |
| Americas | 1952 | 7 |
| Americas | 1957 | 5 |
| Americas | 1962 | 2 |
| Americas | 1967 | 1 |
| Americas | 1972 | 0 |
| Americas | 1977 | 0 |
| Americas | 1982 | 0 |
| Americas | 1987 | 0 |
| Americas | 1992 | 0 |
| Americas | 1997 | 0 |
| Americas | 2002 | 0 |
| Americas | 2007 | 0 |
| Asia | 1952 | 19 |
| Asia | 1957 | 15 |
| Asia | 1962 | 11 |
| Asia | 1967 | 5 |
| Asia | 1972 | 5 |
| Asia | 1977 | 3 |
| Asia | 1982 | 1 |
| Asia | 1987 | 1 |
| Asia | 1992 | 1 |
| Asia | 1997 | 1 |
| Asia | 2002 | 1 |
| Asia | 2007 | 1 |
| Europe | 1952 | 1 |
| Europe | 1957 | 0 |
| Europe | 1962 | 0 |
| Europe | 1967 | 0 |
| Europe | 1972 | 0 |
| Europe | 1977 | 0 |
| Europe | 1982 | 0 |
| Europe | 1987 | 0 |
| Europe | 1992 | 0 |
| Europe | 1997 | 0 |
| Europe | 2002 | 0 |
| Europe | 2007 | 0 |
| Oceania | 1952 | 0 |
| Oceania | 1957 | 0 |
| Oceania | 1962 | 0 |
| Oceania | 1967 | 0 |
| Oceania | 1972 | 0 |
| Oceania | 1977 | 0 |
| Oceania | 1982 | 0 |
| Oceania | 1987 | 0 |
| Oceania | 1992 | 0 |
| Oceania | 1997 | 0 |
| Oceania | 2002 | 0 |
| Oceania | 2007 | 0 |
# For each year/continent cell build a label "proportion (count/total)" of
# countries at or below the benchmark bMark, then print the year x continent
# grid of labels.
foo <- daply(gDat, .(year, continent), function(cell) {
  nLow <- sum(cell$lifeExp <= bMark)
  nAll <- nrow(cell)
  sprintf("%1.2f (%d/%d)", nLow / nAll, nLow, nAll)
})
htmlPrint(foo, include.rownames = TRUE)
| Africa | Americas | Asia | Europe | Oceania | |
|---|---|---|---|---|---|
| 1952 | 0.90 (47/52) | 0.28 (7/25) | 0.58 (19/33) | 0.03 (1/30) | 0.00 (0/2) |
| 1957 | 0.87 (45/52) | 0.20 (5/25) | 0.45 (15/33) | 0.00 (0/30) | 0.00 (0/2) |
| 1962 | 0.67 (35/52) | 0.08 (2/25) | 0.33 (11/33) | 0.00 (0/30) | 0.00 (0/2) |
| 1967 | 0.56 (29/52) | 0.04 (1/25) | 0.15 (5/33) | 0.00 (0/30) | 0.00 (0/2) |
| 1972 | 0.46 (24/52) | 0.00 (0/25) | 0.15 (5/33) | 0.00 (0/30) | 0.00 (0/2) |
| 1977 | 0.31 (16/52) | 0.00 (0/25) | 0.09 (3/33) | 0.00 (0/30) | 0.00 (0/2) |
| 1982 | 0.29 (15/52) | 0.00 (0/25) | 0.03 (1/33) | 0.00 (0/30) | 0.00 (0/2) |
| 1987 | 0.17 (9/52) | 0.00 (0/25) | 0.03 (1/33) | 0.00 (0/30) | 0.00 (0/2) |
| 1992 | 0.17 (9/52) | 0.00 (0/25) | 0.03 (1/33) | 0.00 (0/30) | 0.00 (0/2) |
| 1997 | 0.19 (10/52) | 0.00 (0/25) | 0.03 (1/33) | 0.00 (0/30) | 0.00 (0/2) |
| 2002 | 0.25 (13/52) | 0.00 (0/25) | 0.03 (1/33) | 0.00 (0/30) | 0.00 (0/2) |
| 2007 | 0.17 (9/52) | 0.00 (0/25) | 0.03 (1/33) | 0.00 (0/30) | 0.00 (0/2) |
table(gDat$continent, gDat$year == 1952)
##
## FALSE TRUE
## Africa 572 52
## Americas 275 25
## Asia 363 33
## Europe 330 30
## Oceania 22 2
# Share of countries flagged as low life expectancy in each continent/year
# cell (flagged count divided by the number of rows in the cell), rounded to
# two decimals.
countCapByCont <- daply(gDat, ~continent + year, summarise,
  lowlifefreq = round(length(which(lowlifeExp == 1)) / length(continent), 2)
)
countCapByCont
## year
## continent 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 2002 2007
## Africa 1 1 1 0.98 0.96 0.96 0.88 0.79 0.77 0.85 0.79 0.79
## Americas 0.76 0.64 0.52 0.52 0.4 0.28 0.2 0.08 0.08 0.04 0.04 0
## Asia 0.91 0.82 0.79 0.76 0.61 0.48 0.36 0.3 0.24 0.21 0.15 0.09
## Europe 0.23 0.1 0.03 0.03 0.03 0.03 0 0 0 0 0 0
## Oceania 0 0 0 0 0 0 0 0 0 0 0 0
We can make a table using the “xtable” package.
library(xtable)
# Fit lifeExp ~ (year - earliest year in gDat) by least squares on the rows of
# `x` and return the two coefficients as c(intercept = , slope = ).
# Reads the global data frame gDat to find the earliest year.
jFun <- function(x) {
  yearMin <- min(gDat$year)
  fit <- lm(lifeExp ~ I(year - yearMin), data = x)
  setNames(coef(fit), c("intercept", "slope"))
}
jCoefs <- ddply(gDat, ~country, jFun)
str(jCoefs)
'data.frame': 142 obs. of 3 variables: $ country : Factor w/ 142 levels “Afghanistan”,..: 1 2 3 4 5 6 7 8 9 10 … $ intercept: num 29.9 59.2 43.4 32.1 62.7 … $ slope : num 0.275 0.335 0.569 0.209 0.232 …
# Reproducibly pick 15 countries at random and show their fitted coefficients
# as an HTML table.
set.seed(916)
rowPick <- sample(nrow(jCoefs), size = 15)
foo <- xtable(jCoefs[rowPick, ])
print(foo, type = "html", include.rownames = FALSE)
| country | intercept | slope |
|---|---|---|
| Lebanon | 58.69 | 0.26 |
| Senegal | 36.75 | 0.50 |
| Dominican Republic | 48.60 | 0.47 |
| Oman | 37.21 | 0.77 |
| Germany | 67.57 | 0.21 |
| Korea, Dem. Rep. | 54.91 | 0.32 |
| Mauritius | 55.37 | 0.35 |
| Slovak Republic | 67.01 | 0.13 |
| Comoros | 40.00 | 0.45 |
| Argentina | 62.69 | 0.23 |
| Central African Republic | 38.81 | 0.18 |
| Ecuador | 49.07 | 0.50 |
| West Bank and Gaza | 43.80 | 0.60 |
| Egypt | 40.97 | 0.56 |
| Myanmar | 41.41 | 0.43 |
# print.table(foo, type = 'latex', include.rownames = T)
It was easy (see above) to get the minimum life expectancy seen on each continent by year. Now try to report the year, the continent, the minimum (or max or whatever) life expectancy AND the country it pertains to. You could do something similar with GDP per capita or population (though result likely to be boring with population). Hint: read up on which.min() and which.max().
## this works but produces a frustrating long/tall result; not good for
## printing ddply(gDat, ~ year + continent, function(x) { theMin <-
## which.min(x$lifeExp) x[theMin, c('country', 'year', 'continent',
## 'lifeExp')] })
# For each year/continent cell, label the country with the lowest life
# expectancy as "<rounded lifeExp> <country>" and print the grid of labels.
foo <- daply(gDat, .(year, continent), function(cell) {
  lowest <- which.min(cell$lifeExp)
  sprintf("%1.0f %s", cell$lifeExp[lowest], cell$country[lowest])
})
htmlPrint(foo, include.rownames = TRUE)
| Africa | Americas | Asia | Europe | Oceania | |
|---|---|---|---|---|---|
| 1952 | 30 Gambia | 38 Haiti | 29 Afghanistan | 44 Turkey | 69 Australia |
| 1957 | 32 Sierra Leone | 41 Haiti | 30 Afghanistan | 48 Turkey | 70 New Zealand |
| 1962 | 33 Sierra Leone | 43 Bolivia | 32 Afghanistan | 52 Turkey | 71 Australia |
| 1967 | 34 Sierra Leone | 45 Bolivia | 34 Afghanistan | 54 Turkey | 71 Australia |
| 1972 | 35 Sierra Leone | 47 Bolivia | 36 Afghanistan | 57 Turkey | 72 New Zealand |
| 1977 | 37 Sierra Leone | 50 Haiti | 31 Cambodia | 60 Turkey | 72 New Zealand |
| 1982 | 38 Sierra Leone | 51 Haiti | 40 Afghanistan | 61 Turkey | 74 New Zealand |
| 1987 | 40 Angola | 54 Haiti | 41 Afghanistan | 63 Turkey | 74 New Zealand |
| 1992 | 24 Rwanda | 55 Haiti | 42 Afghanistan | 66 Turkey | 76 New Zealand |
| 1997 | 36 Rwanda | 57 Haiti | 42 Afghanistan | 69 Turkey | 78 New Zealand |
| 2002 | 39 Zambia | 58 Haiti | 42 Afghanistan | 71 Turkey | 79 New Zealand |
| 2007 | 40 Swaziland | 61 Haiti | 44 Afghanistan | 72 Turkey | 80 New Zealand |
Note: the normal rules for padding in sprintf() achieved nothing here, presumably because of what happens in an HTML table, where runs of spaces mean nothing. Is there any way to protect them, I wonder? If I pursue this, here is the type of sprintf() call I wanted: sprintf("%-20s - %1.0f", "canada", 22)
Consider the linear regression we fit in tutorial of life expectancy vs. time. Find the min (and/or max) of the slope (and/or the intercept) within each continent. Report these interesting countries and some info about them in a data.frame. This might work for other variables too.
yearMin <- min(gDat$year)
# Least-squares fit of lifeExp on (year - yearMin) for the rows of `x`;
# returns c(intercept = , slope = ). yearMin is read from the enclosing
# environment.
jFun <- function(x) {
  fit <- lm(lifeExp ~ I(year - yearMin), data = x)
  setNames(coef(fit), c("intercept", "slope"))
}
jCoefs <- ddply(gDat, ~country + continent, jFun)
## giving out the Most Improved Award! going after maximum slope within
## continent
# Within each continent keep the single country with the largest fitted slope.
foo <- ddply(jCoefs, .(continent), function(contCoefs) {
  steepest <- which.max(contCoefs$slope)
  contCoefs[steepest, c("continent", "country", "intercept", "slope")]
})
# digits has one entry per column plus a leading one for the row names.
htmlPrint(foo, digits = c(0, 0, 0, 0, 2))
| continent | country | intercept | slope |
|---|---|---|---|
| Africa | Libya | 42 | 0.63 |
| Americas | Nicaragua | 43 | 0.56 |
| Asia | Oman | 37 | 0.77 |
| Europe | Turkey | 46 | 0.50 |
| Oceania | Australia | 68 | 0.23 |
Sudden, substantial departures from the temporal trend are interesting. This goes for life expectancy, GDP per capita, or population. How might one operationalize this notion of “interesting”?
Make a data.frame of interesting countries, reporting their continent affiliation, and summary statistics on life expectancy (or GDP/capita or whatever you were looking at). Won't it be interesting to start to graphically explore them … next week.
yearMin <- min(gDat$year)
# Least-squares fit of lifeExp on (year - yearMin) for the rows of `x`.
# Returns c(intercept = , slope = , maxResid = ), where maxResid is the largest
# absolute residual divided by the residual standard error of the fit.
# yearMin is read from the enclosing environment.
jFun <- function(x) {
  fit <- lm(lifeExp ~ I(year - yearMin), data = x)
  est <- unname(coef(fit))
  largestResid <- max(abs(resid(fit)))
  c(
    intercept = est[1],
    slope = est[2],
    maxResid = largestResid / summary(fit)$sigma
  )
}
maxResids <- ddply(gDat, ~country + continent, jFun)
# Within each continent keep the country whose scaled maximum residual is
# largest.
foo <- ddply(maxResids, .(continent), function(contFits) {
  contFits[which.max(contFits$maxResid), ]
})
# digits has one entry per column plus a leading one for the row names.
htmlPrint(foo, digits = c(0, 0, 0, 2, 2, 2))
| country | continent | intercept | slope | maxResid |
|---|---|---|---|---|
| Rwanda | Africa | 42.74 | -0.05 | 2.64 |
| Ecuador | Americas | 49.07 | 0.50 | 2.25 |
| Cambodia | Asia | 37.02 | 0.40 | 2.79 |
| Finland | Europe | 66.45 | 0.24 | 2.72 |
| Australia | Oceania | 68.40 | 0.23 | 1.65 |
## I can't resist, I must make a plot
# One panel per selected country: observed points ("p") plus the fitted
# regression line ("r").
xyplot(
  lifeExp ~ year | country,
  data = gDat,
  subset = country %in% foo$country,
  type = c("p", "r")
)
Discuss: Is Ecuador really interesting? Do you think it's the most interesting country in the Americas? I doubt it. What could we do better? Making plots is ABSOLUTELY ESSENTIAL to sanity checking your claims and beliefs based on numerical computation.