library(plyr)
library(mgcv)
## Loading required package: nlme This is mgcv 1.7-26. For overview type
## 'help("mgcv-package")'.
library(lattice)
library(ggplot2)
gdURL <- "http://www.stat.ubc.ca/~jenny/notOcto/STAT545A/examples/gapminder/data/gapminderDataFiveYear.txt"
iDat <- read.table(gdURL, header = TRUE, sep = "\t", quote = "\"") #initial data, with Oceania
removeContinent <- "Oceania"
gDat <- droplevels(subset(iDat, continent != removeContinent))
str(gDat)
## 'data.frame': 1680 obs. of 6 variables:
## $ country : Factor w/ 140 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ pop : num 8425333 9240934 10267083 11537966 13079460 ...
## $ continent: Factor w/ 4 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ gdpPercap: num 779 821 853 836 740 ...
Load the data and get rid of the continent Oceania since it has only 2 countries
The following ddply and xyplot wizardry comes from Jenny's course notes
# modified from jenny's lattice solution to max and min per continent
lifeExpByContinent <- ddply(gDat, ~ continent + year, function(x) {
jLevels <- c("min", "max")
data.frame(lifeExp = range(x$lifeExp),
stat = factor(jLevels, levels = jLevels))
})
head(lifeExpByContinent)
## continent year lifeExp stat
## 1 Africa 1952 30.00 min
## 2 Africa 1952 52.72 max
## 3 Africa 1957 31.57 min
## 4 Africa 1957 58.09 max
## 5 Africa 1962 32.77 min
## 6 Africa 1962 60.25 max
# in lattice
xyplot(lifeExp ~ year | continent, lifeExpByContinent,
group = stat, type = "b", grid = "h", as.table = TRUE,
auto.key = list(columns = 2),
scales = list(y = list(log = TRUE, equispaced.log = FALSE)))
# same thing but with ggplot2
plt <- ggplot(lifeExpByContinent, aes(y=lifeExp, x=year, color=stat))
plt + geom_line() + geom_point() +facet_wrap(~continent)
This density plot uses geom_hex(), which I find easy on the eyes. The correlation between lifeExp and gdpPercap – or better yet log(gdpPercap) – is clear with a scatter plot. We use a LOESS curve with geom_smooth(method='loess'), since this is the option that is chosen with geom_smooth(method='auto') with so many data points; all the data makes the spread are a bit hard to see.
# density plot
plt <- ggplot(gDat, aes(x = gdpPercap, y = lifeExp))
plt + scale_x_log10() + geom_hex()
# scatter plot, showing raw data and spread
plt <- ggplot(gDat, aes(y = lifeExp, x = gdpPercap, color = continent))
plt + facet_wrap(~continent) + geom_point(shape = 1) + geom_smooth(method = "loess") +
scale_x_log10()
Kaggle hosts data analysis competitions. Real world problems are posted, and experts from around the world provide solutins to prediction and optimization challenges. The Featured competitions as of 07 October 2013 include:
# import kaggle diabetes data
kDat <- read.table("/Users/geoffrey/Downloads/kaggleDataSets/diabetes_trainingSet/training_SyncTranscript.csv",
header = TRUE, sep = ",", na.strings = c("NULL", 0, ""))
# type of physician
kClean <- kDat[(!is.na(kDat$PhysicianSpecialty)), ] # remove the NAs
problemFactor <- "Developmental – Behavioral Pediatrics"
kClean <- kClean[kClean$PhysicianSpecialty != problemFactor, ] # one factor level is giving a hard time, after googling around I suspect the encoding of the dash
plt <- ggplot(kClean, aes(x = reorder(PhysicianSpecialty, PhysicianSpecialty,
length)))
plt + geom_histogram() + scale_y_log10() + theme(axis.text.x = element_text(angle = 90,
hjust = 1)) + xlab("PhysicianSpecialty") # vertical text