# Attach ggplot2 with library(): it stops with an error right here if the
# package is not installed. require() would only warn and return FALSE, so
# the script would fail later at the first ggplot() call instead.
library(ggplot2)
# Toy data: 100 draws from a normal distribution (mean 0, sd 3) in `a`,
# paired with a running index 1..100 in `b`.
# NOTE: `dist` has the same name as stats::dist(); function calls to dist()
# still resolve to the function, but the name is easy to misread.
a <- rnorm(100, sd = 3)
dist <- data.frame(a = a, b = seq_along(a))
head(dist, n = 6L)
## a b
## 1 -0.7507 1
## 2 -0.8413 2
## 3 5.7515 3
## 4 -1.0319 4
## 5 0.5129 5
## 6 4.4668 6
# Scatter plot of the toy data: `a` on x, the index `b` on y, with the
# points coloured by `b` as well.
plot <- ggplot(data = dist, mapping = aes(x = a, y = b, colour = b)) +
  geom_point()
print(plot)

# Same plot with a smoother layered on top; no method is given, so
# ggplot2 picks one itself and reports its choice in a message.
print(plot + geom_smooth())
## geom_smooth: method="auto" and size of largest group is <1000, so using
## loess. Use 'method = x' to change the smoothing method.
# plyr provides count(), arrange(), desc() and join() used further down.
library(plyr)

# load() restores the objects stored in the file into the workspace.
# `deaths` is inspected right below; presumably `codes` comes from this
# file too, since nothing else in the script creates it -- TODO confirm.
load(file = "~/deaths.rdata")
head(deaths, n = 6L)
## locD yob mob dob sex age_unit age nation marital stateL countyL
## 1 1-0-0 1952 0 0 2 A 56 1 5 1 1
## 2 1-0-0 1937 0 0 1 A 71 1 5 32 15
## 3 1-0-0 1935 0 0 1 A 73 1 1 1 1
## 4 1-0-0 2008 0 0 1 H 97 1 8 1 1
## 5 1-1-0 1905 0 0 1 A 103 1 2 1 1
## 6 1-1-0 1923 0 0 2 A 85 1 2 1 10
## locationL popL job edu derhab statD countyD locationD popD placeD yod
## 1 1 15 2 2 2 1 0 0 0 3 2008
## 2 31 1 41 2 1 1 0 0 0 9 2008
## 3 1 15 2 7 3 1 0 0 0 11 2008
## 4 1 15 98 8 2 1 0 0 0 3 2008
## 5 1 15 41 2 2 1 1 0 0 11 2008
## 6 79 1 2 1 1 1 1 0 0 1 2008
## mod dod hod minod med_help cod des presume working injury_loc domestic_v
## 1 1 28 8 0 1 E11 9 8 8 88 8
## 2 3 29 3 0 1 B69 9 8 8 88 8
## 3 8 6 8 40 1 D61 9 8 8 88 8
## 4 8 27 9 3 1 P61 4 8 8 88 8
## 5 1 2 22 30 1 E46 X 8 8 88 8
## 6 1 6 21 5 1 K56 6 8 8 88 8
## autopsy certifier state_reg county_reg year_reg mon_reg day_reg weight
## 1 2 3 1 1 2008 2 1 8888
## 2 2 3 32 15 2008 4 21 8888
## 3 2 3 1 1 2008 8 28 8888
## 4 2 3 1 1 2008 8 29 2350
## 5 2 3 1 1 2008 1 10 8888
## 6 2 3 1 10 2008 1 8 8888
## year_cert mon_cert day_cert pregnant labor_cod labor_c loc muni state
## 1 2008 1 28 8 8 8 NA NA NA
## 2 2008 4 18 8 8 8 NA NA NA
## 3 2008 8 6 8 8 8 NA NA NA
## 4 2008 8 27 8 8 8 NA NA NA
## 5 2008 1 3 8 8 8 NA NA NA
## 6 2008 1 6 8 8 8 NA NA NA
## name lat long altitude death_date
## 1 <NA> NA NA NA 2008-01-28
## 2 <NA> NA NA NA 2008-03-29
## 3 <NA> NA NA NA 2008-08-06
## 4 <NA> NA NA NA 2008-08-27
## 5 <NA> NA NA NA 2008-01-02
## 6 <NA> NA NA NA 2008-01-06
# Peek at the lookup table: one row per cause-of-death code (`cod`) with
# its description (`disease`) and a line-wrapped variant (`disease2`).
head(x = codes, n = 6L)
## cod disease
## 1 A00 Cholera
## 2 A01 Typhoid and paratyphoid fevers
## 3 A02 Other salmonella infections
## 4 A03 Shigellosis
## 5 A04 Other bacterial intestinal infections
## 6 A05 Other bacterial foodborne intoxications, not elsewhere classified
## disease2
## 1 Cholera
## 2 Typhoid and paratyphoid\nfevers
## 3 Other salmonella infections
## 4 Shigellosis
## 5 Other bacterial intestinal\ninfections
## 6 Other bacterial foodborne\nintoxications, not elsewhere\nclassified
So there is one table with all the deaths, and one connecting the short death codes to descriptions.
Let's see which causes are most frequent.
# Tally how many rows of `deaths` carry each cause code; the result has
# one row per code, with the tally in a `freq` column.
codefreq <- plyr::count(deaths, vars = "cod")
head(codefreq, n = 6L)
## cod freq
## 1 A01 52
## 2 A02 64
## 3 A03 7
## 4 A04 145
## 5 A05 21
## 6 A06 89
Let's sort by frequency, in descending order.
# Order the tally so the most frequent cause codes come first.
cause <- plyr::arrange(codefreq, plyr::desc(freq))
head(cause, n = 6L)
## cod freq
## 1 I21 48869
## 2 E11 43960
## 3 E14 28293
## 4 J44 16540
## 5 K70 13361
## 6 J18 13070
Join the frequency table with the code descriptions, and select the top 20.
# Attach the descriptions from `codes` to the sorted frequency table.
# The key and join type are spelled out instead of relying on join()'s
# defaults (all shared column names, left join), so an extra shared
# column appearing in either table cannot silently change the match.
# A left join keeps every row of `cause` in its existing (sorted) order.
cause <- join(cause, codes, by = "cod", type = "left")

# Keep the 20 most frequent causes and show them.
cause20 <- head(cause, n = 20L)
cause20
## cod freq disease
## 1 I21 48869 Acute myocardial infarction
## 2 E11 43960 Non-insulin-dependent diabetes mellitus
## 3 E14 28293 Unspecified diabetes mellitus
## 4 J44 16540 Other chronic obstructive pulmonary disease
## 5 K70 13361 Alcoholic liver disease
## 6 J18 13070 Pneumonia, organism unspecified
## 7 K74 13017 Fibrosis and cirrhosis of liver
## 8 I25 10413 Chronic ischemic heart disease
## 9 X59 9211 Exposure to unspecified factor
## 10 I50 8821 Heart failure
## 11 X95 8640 Assault (homicide) by other and unspecified firearm discharge
## 12 N18 7899 Chronic renal failure
## 13 I67 7509 Other cerebrovascular diseases
## 14 I61 6776 Intracerebral hemorrhage
## 15 C34 6705 Malignant neoplasm of bronchus and lung
## 16 C16 5513 Malignant neoplasm of stomach
## 17 I64 5268 Stroke, not specified as hemorrhage or infarction
## 18 C61 5153 Malignant neoplasm of prostate
## 19 I10 5090 Essential (primary) hypertension
## 20 C22 5044 Malignant neoplasm of liver and intrahepatic bile ducts
## disease2
## 1 Acute myocardial infarction
## 2 Non-insulin-dependent\ndiabetes mellitus
## 3 Unspecified diabetes mellitus
## 4 Other chronic obstructive\npulmonary disease
## 5 Alcoholic liver disease
## 6 Pneumonia, organism\nunspecified
## 7 Fibrosis and cirrhosis of\nliver
## 8 Chronic ischemic heart\ndisease
## 9 Exposure to unspecified\nfactor
## 10 Heart failure
## 11 Assault (homicide) by other\nand unspecified firearm\ndischarge
## 12 Chronic renal failure
## 13 Other cerebrovascular\ndiseases
## 14 Intracerebral hemorrhage
## 15 Malignant neoplasm of\nbronchus and lung
## 16 Malignant neoplasm of stomach
## 17 Stroke, not specified as\nhemorrhage or infarction
## 18 Malignant neoplasm of\nprostate
## 19 Essential (primary)\nhypertension
## 20 Malignant neoplasm of liver\nand intrahepatic bile ducts
Try to plot it
# First attempt: one point per cause, with deaths (in units of 100,000)
# on x and the disease description on y.
plot <- ggplot(data = cause20, mapping = aes(x = freq / 1e+05, y = disease))
print(plot + geom_point())
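The plot is not sorted by frequency even though cause20 was already sorted: ggplot2 orders a discrete axis by the factor levels of the variable (alphabetical by default), not by the row order of the data. Will try again, reordering the diseases by frequency and also adding some more detail (scaling the X axis, adding an axis title).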
# Second attempt: reorder() ranks the diseases by `freq` so the y axis
# follows the frequencies; x is now deaths in units of 10,000.
plot <- ggplot(
  data = cause20,
  mapping = aes(x = freq / 10000, y = reorder(disease, freq))
)
pointplot <- plot + geom_point()

# Log-scaled x axis with a title and explicit breaks at 1 through 5.
pointplot + scale_x_log10(name = "deaths (x 10,000)", breaks = seq_len(5))