Notes from GGPlot presentation

# Attach ggplot2 with library() so a missing package stops the script here
# with a clear error, instead of require() returning FALSE and the failure
# surfacing later as "could not find function".
library(ggplot2)

Generate a data frame with 100 draws from a normal distribution (mean 0, sd 3) in column a and the draw index in column b

# 100 random draws from a normal distribution with mean 0 and sd 3.
a <- rnorm(100, mean = 0, sd = 3)
# Pair every draw with its position (1..100) in a two-column data frame.
dist <- data.frame(a = a, b = seq_len(100))
# Show the first six rows.
head(dist, n = 6L)
##         a b
## 1 -0.7507 1
## 2 -0.8413 2
## 3  5.7515 3
## 4 -1.0319 4
## 5  0.5129 5
## 6  4.4668 6

Simple scatterplot

# Scatterplot of the draws: value on x, draw index on y, points coloured by
# index. The finished plot object is kept in `plot` for reuse further down.
plot <- ggplot(data = dist, mapping = aes(x = a, y = b, colour = b)) +
  geom_point()
print(plot)

plot of chunk scatter

Add smooth line

# Overlay a smoother on the existing scatterplot. No method is specified, so
# ggplot2 picks one itself and reports its choice in the message below.
plot + geom_smooth()
## geom_smooth: method="auto" and size of largest group is <1000, so using
## loess. Use 'method = x' to change the smoothing method.

plot of chunk smooth

Following along with the video: transforming data with plyr

# plyr presumably supplies count(), arrange(), desc() and join() used in the
# following steps -- TODO confirm no other attached package masks them.
library(plyr)
# Restore the objects saved in an .rdata file in the user's home directory.
# Presumably this is what defines `deaths` and `codes` inspected below --
# TODO confirm against the contents of the file.
load("~/deaths.rdata")

Describe data

# Preview the first six rows of the death records (one row per death).
head(deaths)
##    locD  yob mob dob sex age_unit age nation marital stateL countyL
## 1 1-0-0 1952   0   0   2        A  56      1       5      1       1
## 2 1-0-0 1937   0   0   1        A  71      1       5     32      15
## 3 1-0-0 1935   0   0   1        A  73      1       1      1       1
## 4 1-0-0 2008   0   0   1        H  97      1       8      1       1
## 5 1-1-0 1905   0   0   1        A 103      1       2      1       1
## 6 1-1-0 1923   0   0   2        A  85      1       2      1      10
##   locationL popL job edu derhab statD countyD locationD popD placeD  yod
## 1         1   15   2   2      2     1       0         0    0      3 2008
## 2        31    1  41   2      1     1       0         0    0      9 2008
## 3         1   15   2   7      3     1       0         0    0     11 2008
## 4         1   15  98   8      2     1       0         0    0      3 2008
## 5         1   15  41   2      2     1       1         0    0     11 2008
## 6        79    1   2   1      1     1       1         0    0      1 2008
##   mod dod hod minod med_help cod des presume working injury_loc domestic_v
## 1   1  28   8     0        1 E11   9       8       8         88          8
## 2   3  29   3     0        1 B69   9       8       8         88          8
## 3   8   6   8    40        1 D61   9       8       8         88          8
## 4   8  27   9     3        1 P61   4       8       8         88          8
## 5   1   2  22    30        1 E46   X       8       8         88          8
## 6   1   6  21     5        1 K56   6       8       8         88          8
##   autopsy certifier state_reg county_reg year_reg mon_reg day_reg weight
## 1       2         3         1          1     2008       2       1   8888
## 2       2         3        32         15     2008       4      21   8888
## 3       2         3         1          1     2008       8      28   8888
## 4       2         3         1          1     2008       8      29   2350
## 5       2         3         1          1     2008       1      10   8888
## 6       2         3         1         10     2008       1       8   8888
##   year_cert mon_cert day_cert pregnant labor_cod labor_c loc muni state
## 1      2008        1       28        8         8       8  NA   NA    NA
## 2      2008        4       18        8         8       8  NA   NA    NA
## 3      2008        8        6        8         8       8  NA   NA    NA
## 4      2008        8       27        8         8       8  NA   NA    NA
## 5      2008        1        3        8         8       8  NA   NA    NA
## 6      2008        1        6        8         8       8  NA   NA    NA
##   name lat long altitude death_date
## 1 <NA>  NA   NA       NA 2008-01-28
## 2 <NA>  NA   NA       NA 2008-03-29
## 3 <NA>  NA   NA       NA 2008-08-06
## 4 <NA>  NA   NA       NA 2008-08-27
## 5 <NA>  NA   NA       NA 2008-01-02
## 6 <NA>  NA   NA       NA 2008-01-06
# Preview the first six rows of the lookup table mapping `cod` to a disease
# description (`disease`, plus `disease2` with embedded line breaks).
head(codes)
##   cod                                                           disease
## 1 A00                                                           Cholera
## 2 A01                                    Typhoid and paratyphoid fevers
## 3 A02                                       Other salmonella infections
## 4 A03                                                       Shigellosis
## 5 A04                             Other bacterial intestinal infections
## 6 A05 Other bacterial foodborne intoxications, not elsewhere classified
##                                                              disease2
## 1                                                             Cholera
## 2                                     Typhoid and paratyphoid\nfevers
## 3                                         Other salmonella infections
## 4                                                         Shigellosis
## 5                              Other bacterial intestinal\ninfections
## 6 Other bacterial foodborne\nintoxications, not elsewhere\nclassified

So there is one table with all the deaths, and one connecting the short death codes to descriptions.

Let's see which causes are most frequent.

# Tally how many death records carry each cause-of-death code; the result has
# one row per code with the tally in `freq`.
codefreq <- plyr::count(df = deaths, vars = "cod")
head(codefreq)
##   cod freq
## 1 A01   52
## 2 A02   64
## 3 A03    7
## 4 A04  145
## 5 A05   21
## 6 A06   89

Let's sort by frequency

# Reorder the tally so the most frequent causes come first.
cause <- plyr::arrange(df = codefreq, plyr::desc(freq))
head(cause)
##   cod  freq
## 1 I21 48869
## 2 E11 43960
## 3 E14 28293
## 4 J44 16540
## 5 K70 13361
## 6 J18 13070

Join the frequency table with the code descriptions, and select top 20

# Attach the disease descriptions to each code. The key is left implicit, so
# plyr joins on the shared column(s) and reports which ones it used.
cause <- plyr::join(x = cause, y = codes, type = "left", match = "all")
## Joining by: cod
# Keep the first 20 rows (or fewer if the table is shorter), i.e. the 20 most
# frequent causes, since the table is ordered by descending frequency.
cause20 <- cause[seq_len(min(20L, nrow(cause))), , drop = FALSE]
cause20
##    cod  freq                                                       disease
## 1  I21 48869                                   Acute myocardial infarction
## 2  E11 43960                       Non-insulin-dependent diabetes mellitus
## 3  E14 28293                                 Unspecified diabetes mellitus
## 4  J44 16540                   Other chronic obstructive pulmonary disease
## 5  K70 13361                                       Alcoholic liver disease
## 6  J18 13070                               Pneumonia, organism unspecified
## 7  K74 13017                               Fibrosis and cirrhosis of liver
## 8  I25 10413                                Chronic ischemic heart disease
## 9  X59  9211                                Exposure to unspecified factor
## 10 I50  8821                                                 Heart failure
## 11 X95  8640 Assault (homicide) by other and unspecified firearm discharge
## 12 N18  7899                                         Chronic renal failure
## 13 I67  7509                                Other cerebrovascular diseases
## 14 I61  6776                                      Intracerebral hemorrhage
## 15 C34  6705                       Malignant neoplasm of bronchus and lung
## 16 C16  5513                                 Malignant neoplasm of stomach
## 17 I64  5268             Stroke, not specified as hemorrhage or infarction
## 18 C61  5153                                Malignant neoplasm of prostate
## 19 I10  5090                              Essential (primary) hypertension
## 20 C22  5044       Malignant neoplasm of liver and intrahepatic bile ducts
##                                                           disease2
## 1                                      Acute myocardial infarction
## 2                         Non-insulin-dependent\ndiabetes mellitus
## 3                                    Unspecified diabetes mellitus
## 4                     Other chronic obstructive\npulmonary disease
## 5                                          Alcoholic liver disease
## 6                                 Pneumonia, organism\nunspecified
## 7                                 Fibrosis and cirrhosis of\nliver
## 8                                  Chronic ischemic heart\ndisease
## 9                                  Exposure to unspecified\nfactor
## 10                                                   Heart failure
## 11 Assault (homicide) by other\nand unspecified firearm\ndischarge
## 12                                           Chronic renal failure
## 13                                 Other cerebrovascular\ndiseases
## 14                                        Intracerebral hemorrhage
## 15                        Malignant neoplasm of\nbronchus and lung
## 16                                   Malignant neoplasm of stomach
## 17              Stroke, not specified as\nhemorrhage or infarction
## 18                                 Malignant neoplasm of\nprostate
## 19                               Essential (primary)\nhypertension
## 20        Malignant neoplasm of liver\nand intrahepatic bile ducts

Try to plot it

# First attempt: deaths (in units of 100,000) on x, disease name on y.
# `disease` is mapped as-is, with no explicit ordering applied to the y axis.
plot <- ggplot(data = cause20, mapping = aes(x = freq / 1e+05, y = disease))
plot + geom_point()

plot of chunk plotdeaths

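Not sure why it's not sorted by frequency, since cause20 was already sorted — most likely because ggplot2 orders a discrete axis by factor levels (alphabetical by default) rather than by the row order of the data frame. Will try again using reorder() to sort the diseases by frequency, also adding some more detail (log-scaling the X axis and giving it a descriptive label).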

# Second attempt: deaths in units of 10,000 on x; reorder() sorts the disease
# categories on the y axis by their frequency.
plot <- ggplot(
  data = cause20,
  mapping = aes(x = freq / 10000, y = reorder(disease, freq))
)
pointplot <- plot + geom_point()
# Log-scale the x axis, label it, and place breaks at 1 through 5.
pointplot + scale_x_log10(name = "deaths (x 10,000)", breaks = seq_len(5))

plot of chunk plotdeaths2