Notes from GGPlot presentation

library(ggplot2)
library(MASS)  # used for robust regresion I think (rlm)
library(stringr)  # to shorten disease descriptions
library(plyr)  # apply operation to many segments
library(knitr)
knit2html("foo.Rmd", options = c("toc", markdown::markdownHTMLOptions(TRUE)))

## Warning: cannot open file 'foo.Rmd': No such file or directory

## Error: cannot open the connection

Playing around with ggplot

First I'm going to do some very simple playing around with ggplot.

Generate a dataframe with a normal distribution

a <- rnorm(n = 100, mean = 0, sd = 3)
dist <- data.frame(a, b = 1:100)
head(dist)

##         a b
## 1 -0.8947 1
## 2 -0.3161 2
## 3 -1.5843 3
## 4  1.6067 4
## 5 -0.3949 5
## 6  3.7701 6

Simple scatterplot

plot <- ggplot(dist, aes(x = a, y = b, colour = b))
plot <- plot + geom_point()
plot

plot of chunk scatter

Add smooth line

plot + geom_smooth()

## geom_smooth: method="auto" and size of largest group is <1000, so using
## loess. Use 'method = x' to change the smoothing method.

plot of chunk smooth

Following along with video, transform

Video, data

load("~/deaths.rdata")

Describe data

head(deaths)

##    locD  yob mob dob sex age_unit age nation marital stateL countyL
## 1 1-0-0 1952   0   0   2        A  56      1       5      1       1
## 2 1-0-0 1937   0   0   1        A  71      1       5     32      15
## 3 1-0-0 1935   0   0   1        A  73      1       1      1       1
## 4 1-0-0 2008   0   0   1        H  97      1       8      1       1
## 5 1-1-0 1905   0   0   1        A 103      1       2      1       1
## 6 1-1-0 1923   0   0   2        A  85      1       2      1      10
##   locationL popL job edu derhab statD countyD locationD popD placeD  yod
## 1         1   15   2   2      2     1       0         0    0      3 2008
## 2        31    1  41   2      1     1       0         0    0      9 2008
## 3         1   15   2   7      3     1       0         0    0     11 2008
## 4         1   15  98   8      2     1       0         0    0      3 2008
## 5         1   15  41   2      2     1       1         0    0     11 2008
## 6        79    1   2   1      1     1       1         0    0      1 2008
##   mod dod hod minod med_help cod des presume working injury_loc domestic_v
## 1   1  28   8     0        1 E11   9       8       8         88          8
## 2   3  29   3     0        1 B69   9       8       8         88          8
## 3   8   6   8    40        1 D61   9       8       8         88          8
## 4   8  27   9     3        1 P61   4       8       8         88          8
## 5   1   2  22    30        1 E46   X       8       8         88          8
## 6   1   6  21     5        1 K56   6       8       8         88          8
##   autopsy certifier state_reg county_reg year_reg mon_reg day_reg weight
## 1       2         3         1          1     2008       2       1   8888
## 2       2         3        32         15     2008       4      21   8888
## 3       2         3         1          1     2008       8      28   8888
## 4       2         3         1          1     2008       8      29   2350
## 5       2         3         1          1     2008       1      10   8888
## 6       2         3         1         10     2008       1       8   8888
##   year_cert mon_cert day_cert pregnant labor_cod labor_c loc muni state
## 1      2008        1       28        8         8       8  NA   NA    NA
## 2      2008        4       18        8         8       8  NA   NA    NA
## 3      2008        8        6        8         8       8  NA   NA    NA
## 4      2008        8       27        8         8       8  NA   NA    NA
## 5      2008        1        3        8         8       8  NA   NA    NA
## 6      2008        1        6        8         8       8  NA   NA    NA
##   name lat long altitude death_date
## 1 <NA>  NA   NA       NA 2008-01-28
## 2 <NA>  NA   NA       NA 2008-03-29
## 3 <NA>  NA   NA       NA 2008-08-06
## 4 <NA>  NA   NA       NA 2008-08-27
## 5 <NA>  NA   NA       NA 2008-01-02
## 6 <NA>  NA   NA       NA 2008-01-06

head(codes)

##   cod                                                           disease
## 1 A00                                                           Cholera
## 2 A01                                    Typhoid and paratyphoid fevers
## 3 A02                                       Other salmonella infections
## 4 A03                                                       Shigellosis
## 5 A04                             Other bacterial intestinal infections
## 6 A05 Other bacterial foodborne intoxications, not elsewhere classified
##                                                              disease2
## 1                                                             Cholera
## 2                                     Typhoid and paratyphoid\nfevers
## 3                                         Other salmonella infections
## 4                                                         Shigellosis
## 5                              Other bacterial intestinal\ninfections
## 6 Other bacterial foodborne\nintoxications, not elsewhere\nclassified

So there is one table with all the deaths, and one connecting the short death codes to descriptions.

Let's see which causes are most frequent.

codefreq <- count(deaths, "cod")
head(codefreq)

##   cod freq
## 1 A01   52
## 2 A02   64
## 3 A03    7
## 4 A04  145
## 5 A05   21
## 6 A06   89

Sort by frequency

cause <- arrange(codefreq, desc(freq))
head(cause)

##   cod  freq
## 1 I21 48869
## 2 E11 43960
## 3 E14 28293
## 4 J44 16540
## 5 K70 13361
## 6 J18 13070

Join the frequency table with the code descriptions, and select top 20

cause <- join(cause, codes)

## Joining by: cod

cause20 <- head(cause, 20)
cause20

##    cod  freq                                                       disease
## 1  I21 48869                                   Acute myocardial infarction
## 2  E11 43960                       Non-insulin-dependent diabetes mellitus
## 3  E14 28293                                 Unspecified diabetes mellitus
## 4  J44 16540                   Other chronic obstructive pulmonary disease
## 5  K70 13361                                       Alcoholic liver disease
## 6  J18 13070                               Pneumonia, organism unspecified
## 7  K74 13017                               Fibrosis and cirrhosis of liver
## 8  I25 10413                                Chronic ischemic heart disease
## 9  X59  9211                                Exposure to unspecified factor
## 10 I50  8821                                                 Heart failure
## 11 X95  8640 Assault (homicide) by other and unspecified firearm discharge
## 12 N18  7899                                         Chronic renal failure
## 13 I67  7509                                Other cerebrovascular diseases
## 14 I61  6776                                      Intracerebral hemorrhage
## 15 C34  6705                       Malignant neoplasm of bronchus and lung
## 16 C16  5513                                 Malignant neoplasm of stomach
## 17 I64  5268             Stroke, not specified as hemorrhage or infarction
## 18 C61  5153                                Malignant neoplasm of prostate
## 19 I10  5090                              Essential (primary) hypertension
## 20 C22  5044       Malignant neoplasm of liver and intrahepatic bile ducts
##                                                           disease2
## 1                                      Acute myocardial infarction
## 2                         Non-insulin-dependent\ndiabetes mellitus
## 3                                    Unspecified diabetes mellitus
## 4                     Other chronic obstructive\npulmonary disease
## 5                                          Alcoholic liver disease
## 6                                 Pneumonia, organism\nunspecified
## 7                                 Fibrosis and cirrhosis of\nliver
## 8                                  Chronic ischemic heart\ndisease
## 9                                  Exposure to unspecified\nfactor
## 10                                                   Heart failure
## 11 Assault (homicide) by other\nand unspecified firearm\ndischarge
## 12                                           Chronic renal failure
## 13                                 Other cerebrovascular\ndiseases
## 14                                        Intracerebral hemorrhage
## 15                        Malignant neoplasm of\nbronchus and lung
## 16                                   Malignant neoplasm of stomach
## 17              Stroke, not specified as\nhemorrhage or infarction
## 18                                 Malignant neoplasm of\nprostate
## 19                               Essential (primary)\nhypertension
## 20        Malignant neoplasm of liver\nand intrahepatic bile ducts

Try to plot it

plot <- ggplot(cause20, aes(x = freq/1e+05, y = disease))
plot + geom_point()

plot of chunk plotdeaths

Not sure why it's not sorted by frequency, since cause20 was already sorted, but anyway. Will try again, also adding some more detail (scaling the X axis, adding legend).

plot <- ggplot(cause20, aes(x = freq/10000, y = reorder(disease, freq)))
pointplot <- plot + geom_point()
pointplot + scale_x_log10("deaths (x 10,000)", breaks = 1:5)

plot of chunk plotdeaths2

Time of death

First some data cleaning, NAs are not plotted.

deaths$hod[deaths$hod == 99] <- NA
deaths$hod[deaths$hod == 24] <- NA

And lets make a frequency table, removing NAs

hodcount <- count(deaths, "hod")
print(head(hodcount))

##   hod  freq
## 1   0 20517
## 2   1 20430
## 3   2 18962
## 4   3 19729
## 5   4 20239
## 6   5 22126

hod <- subset(hodcount, !is.na(hod))

Then let's make our first simple line graph

plot <- ggplot(hod, aes(x = hod, y = freq))
plot + geom_line()

plot of chunk hod-line

Let's split it into causes, and look at which causes of death have interesting distributions.

First, let's create a table with one line for number of deaths per hour for a given code.

hod2 <- count(deaths, c("cod", "hod"))
head(hod2)

##   cod hod freq
## 1 A01   1    3
## 2 A01   2    1
## 3 A01   3    4
## 4 A01   5    5
## 5 A01   6    1
## 6 A01   8    1

Then let's apply a function to each row. This uses the dataframe hod2, grouping by the variable cod. It adds the column prop, which contains the percentage of deaths for this code which occurs at this time.

hod3 <- ddply(hod2, "cod", mutate, prop = freq/sum(freq))
head(hod3)

##   cod hod freq    prop
## 1 A01   1    3 0.05769
## 2 A01   2    1 0.01923
## 3 A01   3    4 0.07692
## 4 A01   5    5 0.09615
## 5 A01   6    1 0.01923
## 6 A01   8    1 0.01923

ddply examples - we interrupt our programming

Let's set up a simple data.frame

names = c("Stian", "Stian", "Stian", "John", "John", "Sebastian")
numbers = c(1, 2, 3, 4, 1, 2)
test = data.frame(names, numbers)

Try to summarize by name

ddply(test, "names", summarize, N = length(numbers), mean = mean(numbers))

##       names N mean
## 1      John 2  2.5
## 2 Sebastian 1  2.0
## 3     Stian 3  2.0

Mutation let's you do something to every row, giving you access to the other variables in that group, for example below, we show that 4 (row 1) makes up 80% of the 5 (something) that John has (two rows, 4 and 1)

ddply(test, "names", mutate, prop = numbers/sum(numbers) * 100)

##       names numbers   prop
## 1      John       4  80.00
## 2      John       1  20.00
## 3 Sebastian       2 100.00
## 4     Stian       1  16.67
## 5     Stian       2  33.33
## 6     Stian       3  50.00

Back to the example

Create a new table overall, which contains the frequencies of deaths, and proportions - ie. the same as what we did above, but now for the entire dataset.

overall <- ddply(hod3, "hod", summarize, freq_all = sum(freq))
overall <- mutate(overall, prop_all = freq_all/sum(freq_all))
head(overall)

##   hod freq_all prop_all
## 1   0    20517  0.03803
## 2   1    20430  0.03787
## 3   2    18962  0.03515
## 4   3    19729  0.03657
## 5   4    20239  0.03751
## 6   5    22126  0.04101

Combine the time, so that each row has both the proportional mortality for that hour for that specific mortality cause, and the general mortality at that hour

hod4 <- join(overall, hod3, by = "hod")
head(hod4)

##   hod freq_all prop_all cod freq    prop
## 1   0    20517  0.03803 A02    1 0.01562
## 2   0    20517  0.03803 A04    6 0.04138
## 3   0    20517  0.03803 A06    5 0.05618
## 4   0    20517  0.03803 A09   91 0.02881
## 5   0    20517  0.03803 A15    7 0.03333
## 6   0    20517  0.03803 A16   53 0.03044

Create a model, find outliers

We want to come up with a distance metric between the average mortality and the specific cause mortality per time - to see if any causes significantly diverge from the average pattern.

We will compute a new frequency table, which uses average distance between specific and general mortality squared, as well as count, to be able to exclude causes of death with very few deaths. We'll then exclude causes with fewer than 50 deaths.

devi <- ddply(hod4, "cod", summarize, n = sum(freq), dist = mean((prop - prop_all)^2))
devi <- subset(devi, n > 50)
head(devi)

##    cod    n      dist
## 1  A01   52 9.345e-04
## 2  A02   64 6.579e-04
## 4  A04  145 1.753e-04
## 6  A06   89 3.891e-04
## 9  A09 3159 3.014e-05
## 10 A15  210 1.980e-04

I want to try to plot this

ggplot(devi, aes(x = n, y = dist)) + geom_point()

plot of chunk unnamed-chunk-10

… and we see that most of the variation comes from the codes with very few observations. For fun, I'm going to try that again, chopping of everything above 10,000 observations.

devi10k <- subset(devi, n > 10000)
ggplot(devi10k, aes(x = n, y = dist)) + geom_point()

plot of chunk unnamed-chunk-11

We can also try to plot it on a double log scale (ie. we do log10 of the variable, and plot it on a log scale)

plot <- ggplot(devi, aes(x = n, y = log10(dist))) + geom_point() + scale_x_log10()
plot

plot of chunk unnamed-chunk-12

In the video, he also showed how to add a line using rlm for robust regression (se is whether to display confidence interval or not)

plot + geom_smooth(method = "rlm", se = F)

plot of chunk unnamed-chunk-13

plot + geom_smooth(method = "rlm", se = T)

plot of chunk unnamed-chunk-13

Now we want to pick out the points in the top triangle, which are clearly divergent from the trend. Let's create a new column called residual, which is the distance from the point to the regression line, and select the points with residuals above 1.5. Let me first plot that, to see what it looks like.

devi$resid <- resid(rlm(log(dist) ~ log(n), data = devi))
head(devi)

##    cod    n      dist    resid
## 1  A01   52 9.345e-04  0.32472
## 2  A02   64 6.579e-04  0.14472
## 4  A04  145 1.753e-04 -0.50459
## 6  A06   89 3.891e-04 -0.10912
## 9  A09 3159 3.014e-05  0.27164
## 10 A15  210 1.980e-04 -0.07767

ggplot(devi, aes(x = n, y = log10(dist))) + geom_point(aes(colour = resid > 
    1.5)) + scale_x_log10() + geom_smooth(method = "rlm", se = F)

plot of chunk unnamed-chunk-14

Now let's pick out unusual death distributions, separating by small (smaller than 350) and big. Note how we use match_df to pick in the columns from hod4, and not just the columns in devi.

hod_unusual <- subset(devi, resid > 1.5)
hod_unusualsml <- match_df(hod4, subset(hod_unusual, n < 350))

## Matching on: cod

hod_unusualbig <- match_df(hod4, subset(hod_unusual, n >= 350))

## Matching on: cod

We need to add the disease names, and make them shorter (copied from original code)

codesshort <- codes
codesshort$disease <- unlist(lapply(codes$disease, function(x) {
    pieces <- strwrap(x, width = 30)
    if (length(pieces) > 1) 
        str_c(pieces[1], "...") else pieces
}))
hod_unusualbigcod <- join(hod_unusualbig, codesshort, by = "cod")
head(hod_unusualbigcod)

##   hod freq_all prop_all cod freq    prop                          disease
## 1   0    20517  0.03803 R99  318 0.12710         Other ill-defined and...
## 2   0    20517  0.03803 V09  261 0.05864   Pedestrian injured in other...
## 3   0    20517  0.03803 V49  208 0.09420 Car occupant injured in other...
## 4   0    20517  0.03803 V87  177 0.06319 Traffic accident of specified...
## 5   0    20517  0.03803 V89  621 0.12673    Motor- or nonmotor-vehicle...
## 6   0    20517  0.03803 W69   33 0.06250 Drowning and submersion while...
##                                                                      disease2
## 1                     Other ill-defined and\nunspecified causes of\nmortality
## 2           Pedestrian injured in other\nand unspecified transport\naccidents
## 3         Car occupant injured in other\nand unspecified transport\naccidents
## 4 Traffic accident of specified\ntype but victim's mode of\ntransport unknown
## 5          Motor- or nonmotor-vehicle\naccident, type of vehicle\nunspecified
## 6                             Drowning and submersion while\nin natural water

Let's graph the less frequent group side-by-side.

plot <- ggplot(hod_unusualbigcod, aes(hod, prop)) + geom_line() + facet_wrap(~disease, 
    ncol = 3)
plot

plot of chunk unnamed-chunk-17

Let's add a line on each graph with the average deaths per hour

plot + geom_line(aes(y = prop_all), data = overall, colour = "grey50")

plot of chunk unnamed-chunk-18