library(ggplot2)
library(MASS) # used for robust regresion I think (rlm)
library(stringr) # to shorten disease descriptions
library(plyr) # apply operation to many segments
library(knitr)
knit2html("foo.Rmd", options = c("toc", markdown::markdownHTMLOptions(TRUE)))
## Warning: cannot open file 'foo.Rmd': No such file or directory
## Error: cannot open the connection
First I'm going to do some very simple playing around with ggplot.
a <- rnorm(n = 100, mean = 0, sd = 3)
dist <- data.frame(a, b = 1:100)
head(dist)
## a b
## 1 -0.8947 1
## 2 -0.3161 2
## 3 -1.5843 3
## 4 1.6067 4
## 5 -0.3949 5
## 6 3.7701 6
plot <- ggplot(dist, aes(x = a, y = b, colour = b))
plot <- plot + geom_point()
plot
plot + geom_smooth()
## geom_smooth: method="auto" and size of largest group is <1000, so using
## loess. Use 'method = x' to change the smoothing method.
load("~/deaths.rdata")
head(deaths)
## locD yob mob dob sex age_unit age nation marital stateL countyL
## 1 1-0-0 1952 0 0 2 A 56 1 5 1 1
## 2 1-0-0 1937 0 0 1 A 71 1 5 32 15
## 3 1-0-0 1935 0 0 1 A 73 1 1 1 1
## 4 1-0-0 2008 0 0 1 H 97 1 8 1 1
## 5 1-1-0 1905 0 0 1 A 103 1 2 1 1
## 6 1-1-0 1923 0 0 2 A 85 1 2 1 10
## locationL popL job edu derhab statD countyD locationD popD placeD yod
## 1 1 15 2 2 2 1 0 0 0 3 2008
## 2 31 1 41 2 1 1 0 0 0 9 2008
## 3 1 15 2 7 3 1 0 0 0 11 2008
## 4 1 15 98 8 2 1 0 0 0 3 2008
## 5 1 15 41 2 2 1 1 0 0 11 2008
## 6 79 1 2 1 1 1 1 0 0 1 2008
## mod dod hod minod med_help cod des presume working injury_loc domestic_v
## 1 1 28 8 0 1 E11 9 8 8 88 8
## 2 3 29 3 0 1 B69 9 8 8 88 8
## 3 8 6 8 40 1 D61 9 8 8 88 8
## 4 8 27 9 3 1 P61 4 8 8 88 8
## 5 1 2 22 30 1 E46 X 8 8 88 8
## 6 1 6 21 5 1 K56 6 8 8 88 8
## autopsy certifier state_reg county_reg year_reg mon_reg day_reg weight
## 1 2 3 1 1 2008 2 1 8888
## 2 2 3 32 15 2008 4 21 8888
## 3 2 3 1 1 2008 8 28 8888
## 4 2 3 1 1 2008 8 29 2350
## 5 2 3 1 1 2008 1 10 8888
## 6 2 3 1 10 2008 1 8 8888
## year_cert mon_cert day_cert pregnant labor_cod labor_c loc muni state
## 1 2008 1 28 8 8 8 NA NA NA
## 2 2008 4 18 8 8 8 NA NA NA
## 3 2008 8 6 8 8 8 NA NA NA
## 4 2008 8 27 8 8 8 NA NA NA
## 5 2008 1 3 8 8 8 NA NA NA
## 6 2008 1 6 8 8 8 NA NA NA
## name lat long altitude death_date
## 1 <NA> NA NA NA 2008-01-28
## 2 <NA> NA NA NA 2008-03-29
## 3 <NA> NA NA NA 2008-08-06
## 4 <NA> NA NA NA 2008-08-27
## 5 <NA> NA NA NA 2008-01-02
## 6 <NA> NA NA NA 2008-01-06
head(codes)
## cod disease
## 1 A00 Cholera
## 2 A01 Typhoid and paratyphoid fevers
## 3 A02 Other salmonella infections
## 4 A03 Shigellosis
## 5 A04 Other bacterial intestinal infections
## 6 A05 Other bacterial foodborne intoxications, not elsewhere classified
## disease2
## 1 Cholera
## 2 Typhoid and paratyphoid\nfevers
## 3 Other salmonella infections
## 4 Shigellosis
## 5 Other bacterial intestinal\ninfections
## 6 Other bacterial foodborne\nintoxications, not elsewhere\nclassified
So there is one table with all the deaths, and one connecting the short death codes to descriptions.
Let's see which causes are most frequent.
codefreq <- count(deaths, "cod")
head(codefreq)
## cod freq
## 1 A01 52
## 2 A02 64
## 3 A03 7
## 4 A04 145
## 5 A05 21
## 6 A06 89
cause <- arrange(codefreq, desc(freq))
head(cause)
## cod freq
## 1 I21 48869
## 2 E11 43960
## 3 E14 28293
## 4 J44 16540
## 5 K70 13361
## 6 J18 13070
Join the frequency table with the code descriptions, and select top 20
cause <- join(cause, codes)
## Joining by: cod
cause20 <- head(cause, 20)
cause20
## cod freq disease
## 1 I21 48869 Acute myocardial infarction
## 2 E11 43960 Non-insulin-dependent diabetes mellitus
## 3 E14 28293 Unspecified diabetes mellitus
## 4 J44 16540 Other chronic obstructive pulmonary disease
## 5 K70 13361 Alcoholic liver disease
## 6 J18 13070 Pneumonia, organism unspecified
## 7 K74 13017 Fibrosis and cirrhosis of liver
## 8 I25 10413 Chronic ischemic heart disease
## 9 X59 9211 Exposure to unspecified factor
## 10 I50 8821 Heart failure
## 11 X95 8640 Assault (homicide) by other and unspecified firearm discharge
## 12 N18 7899 Chronic renal failure
## 13 I67 7509 Other cerebrovascular diseases
## 14 I61 6776 Intracerebral hemorrhage
## 15 C34 6705 Malignant neoplasm of bronchus and lung
## 16 C16 5513 Malignant neoplasm of stomach
## 17 I64 5268 Stroke, not specified as hemorrhage or infarction
## 18 C61 5153 Malignant neoplasm of prostate
## 19 I10 5090 Essential (primary) hypertension
## 20 C22 5044 Malignant neoplasm of liver and intrahepatic bile ducts
## disease2
## 1 Acute myocardial infarction
## 2 Non-insulin-dependent\ndiabetes mellitus
## 3 Unspecified diabetes mellitus
## 4 Other chronic obstructive\npulmonary disease
## 5 Alcoholic liver disease
## 6 Pneumonia, organism\nunspecified
## 7 Fibrosis and cirrhosis of\nliver
## 8 Chronic ischemic heart\ndisease
## 9 Exposure to unspecified\nfactor
## 10 Heart failure
## 11 Assault (homicide) by other\nand unspecified firearm\ndischarge
## 12 Chronic renal failure
## 13 Other cerebrovascular\ndiseases
## 14 Intracerebral hemorrhage
## 15 Malignant neoplasm of\nbronchus and lung
## 16 Malignant neoplasm of stomach
## 17 Stroke, not specified as\nhemorrhage or infarction
## 18 Malignant neoplasm of\nprostate
## 19 Essential (primary)\nhypertension
## 20 Malignant neoplasm of liver\nand intrahepatic bile ducts
Try to plot it
plot <- ggplot(cause20, aes(x = freq/1e+05, y = disease))
plot + geom_point()
Not sure why it's not sorted by frequency, since cause20 was already sorted, but anyway. Will try again, also adding some more detail (scaling the X axis, adding legend).
plot <- ggplot(cause20, aes(x = freq/10000, y = reorder(disease, freq)))
pointplot <- plot + geom_point()
pointplot + scale_x_log10("deaths (x 10,000)", breaks = 1:5)
First some data cleaning, NAs are not plotted.
deaths$hod[deaths$hod == 99] <- NA
deaths$hod[deaths$hod == 24] <- NA
And lets make a frequency table, removing NAs
hodcount <- count(deaths, "hod")
print(head(hodcount))
## hod freq
## 1 0 20517
## 2 1 20430
## 3 2 18962
## 4 3 19729
## 5 4 20239
## 6 5 22126
hod <- subset(hodcount, !is.na(hod))
Then let's make our first simple line graph
plot <- ggplot(hod, aes(x = hod, y = freq))
plot + geom_line()
Let's split it into causes, and look at which causes of death have interesting distributions.
First, let's create a table with one line for number of deaths per hour for a given code.
hod2 <- count(deaths, c("cod", "hod"))
head(hod2)
## cod hod freq
## 1 A01 1 3
## 2 A01 2 1
## 3 A01 3 4
## 4 A01 5 5
## 5 A01 6 1
## 6 A01 8 1
Then let's apply a function to each row. This uses the dataframe hod2, grouping by the variable cod. It adds the column prop, which contains the percentage of deaths for this code which occurs at this time.
hod3 <- ddply(hod2, "cod", mutate, prop = freq/sum(freq))
head(hod3)
## cod hod freq prop
## 1 A01 1 3 0.05769
## 2 A01 2 1 0.01923
## 3 A01 3 4 0.07692
## 4 A01 5 5 0.09615
## 5 A01 6 1 0.01923
## 6 A01 8 1 0.01923
Let's set up a simple data.frame
names = c("Stian", "Stian", "Stian", "John", "John", "Sebastian")
numbers = c(1, 2, 3, 4, 1, 2)
test = data.frame(names, numbers)
Try to summarize by name
ddply(test, "names", summarize, N = length(numbers), mean = mean(numbers))
## names N mean
## 1 John 2 2.5
## 2 Sebastian 1 2.0
## 3 Stian 3 2.0
Mutation let's you do something to every row, giving you access to the other variables in that group, for example below, we show that 4 (row 1) makes up 80% of the 5 (something) that John has (two rows, 4 and 1)
ddply(test, "names", mutate, prop = numbers/sum(numbers) * 100)
## names numbers prop
## 1 John 4 80.00
## 2 John 1 20.00
## 3 Sebastian 2 100.00
## 4 Stian 1 16.67
## 5 Stian 2 33.33
## 6 Stian 3 50.00
Create a new table overall, which contains the frequencies of deaths, and proportions - ie. the same as what we did above, but now for the entire dataset.
overall <- ddply(hod3, "hod", summarize, freq_all = sum(freq))
overall <- mutate(overall, prop_all = freq_all/sum(freq_all))
head(overall)
## hod freq_all prop_all
## 1 0 20517 0.03803
## 2 1 20430 0.03787
## 3 2 18962 0.03515
## 4 3 19729 0.03657
## 5 4 20239 0.03751
## 6 5 22126 0.04101
Combine the time, so that each row has both the proportional mortality for that hour for that specific mortality cause, and the general mortality at that hour
hod4 <- join(overall, hod3, by = "hod")
head(hod4)
## hod freq_all prop_all cod freq prop
## 1 0 20517 0.03803 A02 1 0.01562
## 2 0 20517 0.03803 A04 6 0.04138
## 3 0 20517 0.03803 A06 5 0.05618
## 4 0 20517 0.03803 A09 91 0.02881
## 5 0 20517 0.03803 A15 7 0.03333
## 6 0 20517 0.03803 A16 53 0.03044
We want to come up with a distance metric between the average mortality and the specific cause mortality per time - to see if any causes significantly diverge from the average pattern.
We will compute a new frequency table, which uses average distance between specific and general mortality squared, as well as count, to be able to exclude causes of death with very few deaths. We'll then exclude causes with fewer than 50 deaths.
devi <- ddply(hod4, "cod", summarize, n = sum(freq), dist = mean((prop - prop_all)^2))
devi <- subset(devi, n > 50)
head(devi)
## cod n dist
## 1 A01 52 9.345e-04
## 2 A02 64 6.579e-04
## 4 A04 145 1.753e-04
## 6 A06 89 3.891e-04
## 9 A09 3159 3.014e-05
## 10 A15 210 1.980e-04
I want to try to plot this
ggplot(devi, aes(x = n, y = dist)) + geom_point()
… and we see that most of the variation comes from the codes with very few observations. For fun, I'm going to try that again, chopping of everything above 10,000 observations.
devi10k <- subset(devi, n > 10000)
ggplot(devi10k, aes(x = n, y = dist)) + geom_point()
We can also try to plot it on a double log scale (ie. we do log10 of the variable, and plot it on a log scale)
plot <- ggplot(devi, aes(x = n, y = log10(dist))) + geom_point() + scale_x_log10()
plot
In the video, he also showed how to add a line using rlm for robust regression (se is whether to display confidence interval or not)
plot + geom_smooth(method = "rlm", se = F)
plot + geom_smooth(method = "rlm", se = T)
Now we want to pick out the points in the top triangle, which are clearly divergent from the trend. Let's create a new column called residual, which is the distance from the point to the regression line, and select the points with residuals above 1.5. Let me first plot that, to see what it looks like.
devi$resid <- resid(rlm(log(dist) ~ log(n), data = devi))
head(devi)
## cod n dist resid
## 1 A01 52 9.345e-04 0.32472
## 2 A02 64 6.579e-04 0.14472
## 4 A04 145 1.753e-04 -0.50459
## 6 A06 89 3.891e-04 -0.10912
## 9 A09 3159 3.014e-05 0.27164
## 10 A15 210 1.980e-04 -0.07767
ggplot(devi, aes(x = n, y = log10(dist))) + geom_point(aes(colour = resid >
1.5)) + scale_x_log10() + geom_smooth(method = "rlm", se = F)
Now let's pick out unusual death distributions, separating by small (smaller than 350) and big. Note how we use match_df to pick in the columns from hod4, and not just the columns in devi.
hod_unusual <- subset(devi, resid > 1.5)
hod_unusualsml <- match_df(hod4, subset(hod_unusual, n < 350))
## Matching on: cod
hod_unusualbig <- match_df(hod4, subset(hod_unusual, n >= 350))
## Matching on: cod
We need to add the disease names, and make them shorter (copied from original code)
codesshort <- codes
codesshort$disease <- unlist(lapply(codes$disease, function(x) {
pieces <- strwrap(x, width = 30)
if (length(pieces) > 1)
str_c(pieces[1], "...") else pieces
}))
hod_unusualbigcod <- join(hod_unusualbig, codesshort, by = "cod")
head(hod_unusualbigcod)
## hod freq_all prop_all cod freq prop disease
## 1 0 20517 0.03803 R99 318 0.12710 Other ill-defined and...
## 2 0 20517 0.03803 V09 261 0.05864 Pedestrian injured in other...
## 3 0 20517 0.03803 V49 208 0.09420 Car occupant injured in other...
## 4 0 20517 0.03803 V87 177 0.06319 Traffic accident of specified...
## 5 0 20517 0.03803 V89 621 0.12673 Motor- or nonmotor-vehicle...
## 6 0 20517 0.03803 W69 33 0.06250 Drowning and submersion while...
## disease2
## 1 Other ill-defined and\nunspecified causes of\nmortality
## 2 Pedestrian injured in other\nand unspecified transport\naccidents
## 3 Car occupant injured in other\nand unspecified transport\naccidents
## 4 Traffic accident of specified\ntype but victim's mode of\ntransport unknown
## 5 Motor- or nonmotor-vehicle\naccident, type of vehicle\nunspecified
## 6 Drowning and submersion while\nin natural water
Let's graph the less frequent group side-by-side.
plot <- ggplot(hod_unusualbigcod, aes(hod, prop)) + geom_line() + facet_wrap(~disease,
ncol = 3)
plot
Let's add a line on each graph with the average deaths per hour
plot + geom_line(aes(y = prop_all), data = overall, colour = "grey50")