reshape2
intro for NCEAS SI
library("reshape2")
library("stringr")
sessionInfo()
## R Under development (unstable) (2013-05-22 r62774)
## Platform: i686-pc-linux-gnu (32-bit)
##
## locale:
## [1] LC_CTYPE=en_CA.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_CA.UTF-8
## [5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_CA.UTF-8
## [7] LC_PAPER=C LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] stringr_0.6.2 reshape2_1.2.2 knitr_1.2
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.3 evaluate_0.4.3 formatR_0.7 plyr_1.8
## [5] tools_3.1.0
(ggplot2)
dat <- read.csv("renesting.csv")
head(dat)
## X.ACI.Interval..days..ACI. X1971 X1972 X1973 X1974 X1975 X1976 X1977
## 1 5 0 1 2 0 0 0 0
## 2 6 0 2 2 0 0 1 0
## 3 7 0 1 0 0 1 1 0
## 4 8 1 0 1 0 0 1 0
## 5 9 3 2 14 0 1 1 0
## 6 10 30 42 50 13 12 52 18
## X1978 X1979 X1980 X1981 X1982 X1983 X1984 X1985 X1986 X1987 X1988 X1989
## 1 0 0 1 1 3 1 1 4 2 0 1 1
## 2 2 1 2 0 3 2 4 1 3 0 0 0
## 3 1 2 4 1 1 0 2 1 3 0 3 4
## 4 2 1 2 1 1 2 0 1 2 0 2 0
## 5 20 6 48 16 20 33 11 6 40 3 98 12
## 6 196 36 360 73 187 227 91 86 309 72 415 109
## X1990 X1991 X1992 X.ACI.Row.Totals.ACI.
## 1 0 4 0 22
## 2 0 2 3 28
## 3 1 1 2 29
## 4 0 6 2 25
## 5 11 13 1 359
## 6 111 123 58 2670
(Column names mangled: we'll worry about this later)
names(dat)[1] <- "interval"
dat <- dat[1:(ncol(dat)-1)] ## drop last column (row totals)
(Could use grep("^X[0-9]{4}",names(dat))
to pull out the correct columns)
mdat1 <- melt(dat)
## Using interval as id variables
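The message (not a warning) means that melt() has guessed which columns to keep as ID variables: when neither id.vars nor measure.vars is specified, it treats all factor and character columns as ID variables and melts everything else. Here that is just the first column, interval.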
head(mdat1)
## interval variable value
## 1 5 X1971 0
## 2 6 X1971 0
## 3 7 X1971 0
## 4 8 X1971 1
## 5 9 X1971 3
## 6 10 X1971 30
str(dcast(mdat1,interval~variable))
## 'data.frame': 62 obs. of 23 variables:
## $ interval: Factor w/ 62 levels "10","11","12",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ X1971 : int 30 68 58 33 22 8 3 2 2 1 ...
## $ X1972 : int 42 158 189 102 45 24 8 3 1 1 ...
## $ X1973 : int 50 98 66 45 24 6 8 2 3 3 ...
## $ X1974 : int 13 22 35 26 18 10 0 0 2 0 ...
## ...
dcast(mdat1,variable~interval)
casts the other way (we've transposed the original data).
acast() recasts the variable as an array.
Formula magic: ~. means “no variable”.
head(dcast(mdat1,variable~.))
## Aggregation function missing: defaulting to length
## variable NA
## 1 X1971 62
## 2 X1972 62
## 3 X1973 62
## 4 X1974 62
## 5 X1975 62
## 6 X1976 62
The warning means that we have more than one datum per casting combination, so we have to do something with them: specify fun.aggregate. (Sometimes we actually want to count – this is a roundabout way to replicate the functionality of table() – but if we want length() as the aggregation function, we should specify it explicitly.)
mdat1B <- dcast(mdat1,variable~.,
fun.aggregate=mean,na.rm=TRUE)
head(mdat1B)
## variable NA
## 1 X1971 11.903
## 2 X1972 28.065
## 3 X1973 15.161
## 4 X1974 6.387
## 5 X1975 7.871
## 6 X1976 36.129
(The naming is messed up; I don't know how to set this automatically, but plyr::rename and setNames are useful.)
names(mdat1B)[2] <- "value"
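To turn the mangled X-prefixed year labels back into numbers, use
as.numeric(gsub("^X","",as.character(variable)))
or stringr::str_extract(as.character(variable),"[0-9]{4}")
(or read the data with check.names=FALSE so that the column names are not mangled in the first place).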
## year site1_temp site1_pH site2_temp site2_pH
## 1 2012 25 6.2 27 6.5
## 2 2013 20 6.1 29 5.4
(mdat2 <- melt(ex,id.var="year"))
## year variable value
## 1 2012 site1_temp 25.0
## 2 2013 site1_temp 20.0
## 3 2012 site1_pH 6.2
## 4 2013 site1_pH 6.1
## 5 2012 site2_temp 27.0
## 6 2013 site2_temp 29.0
## 7 2012 site2_pH 6.5
## 8 2013 site2_pH 5.4
We need to split the variable column into separate site and var columns:
(cc <- colsplit(mdat2$variable,"_",names=c("site","var")))
## site var
## 1 site1 temp
## 2 site1 temp
## 3 site1 pH
## 4 site1 pH
## 5 site2 temp
## 6 site2 temp
## 7 site2 pH
## 8 site2 pH
## use str_extract (could also use gsub() to get rid of the
## stuff we don't want)
cc$site <- as.numeric(str_extract(as.character(cc$site),
"[0-9]+"))
mdat2B <- with(mdat2,data.frame(year,cc,value))
Now we can recast (since we probably want site and year as ID variables, but not var):
dcast(mdat2B,year+site~var)
## year site pH temp
## 1 2012 1 6.2 25
## 2 2012 2 6.5 27
## 3 2013 1 6.1 20
## 4 2013 2 5.4 29
...
variable