## Loading required package: grid
1 reshape
1.1 熔化
melt的用法是 melt(data, id = c("cut", "color", "clarity"))。下面使用reshape2文档示例中的airquality数据(列名已转为小写)来演示。其结果如下:
• 前n列是id (具体列数取决于id里有几项,如"cut", "color", "clarity" 三项就是3列)
• 接下来一列是variable
• 最后是value
## ozone solar.r wind temp month day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## month day variable value
## 1 5 1 ozone 41
## 2 5 2 ozone 36
## 3 5 3 ozone 12
## month day variable value
## 610 9 28 temp 75
## 611 9 29 temp 76
## 612 9 30 temp 68
1.2 锻造
cast时,reshape2中有acast和dcast。acast返回向量/矩阵/数组,可以cast多维,当然也包括2维;dcast返回data frame。因此与dcast相比,acast在cast二维数据时少了第一列:id不再是单独的一列,而是变成了行名。
• id ~ variable:一个或是多个id vs variable。多个id写成 id1 + id2 ~ variable 时结果仍是二维,可以用dcast;写成 id1 ~ id2 ~ variable 时结果是多维,要用acast
# Mean of every variable for each month (rows = month, columns = variable).
dcast(aqm, month ~ variable, mean)
## month ozone solar.r wind temp
## 1 5 23.62 181.3 11.623 65.55
## 2 6 29.44 190.2 10.267 79.10
## 3 7 59.12 216.5 8.942 83.90
## 4 8 59.96 171.9 8.794 83.97
## 5 9 31.45 167.4 10.180 76.90
# margins = TRUE appends the "(all)" row and column shown in the output.
dcast(aqm, month ~ variable, mean, margins = TRUE)
## month ozone solar.r wind temp (all)
## 1 5 23.62 181.3 11.623 65.55 68.71
## 2 6 29.44 190.2 10.267 79.10 87.38
## 3 7 59.12 216.5 8.942 83.90 93.50
## 4 8 59.96 171.9 8.794 83.97 79.71
## 5 9 31.45 167.4 10.180 76.90 71.83
## 6 (all) 42.13 185.9 9.958 77.88 80.06
# The .() helper used in `subset` comes from the plyr package, so plyr must
# be loaded before running this call.
dcast(aqm, month ~ variable, mean, subset = .(variable == "ozone"))
## month ozone
## 1 5 23.62
## 2 6 29.44
## 3 7 59.12
## 4 8 59.96
## 5 9 31.45
# Swap rows and columns: one row per variable, one column per month.
dcast(aqm, variable ~ month, mean)
## variable 5 6 7 8 9
## 1 ozone 23.62 29.44 59.115 59.962 31.45
## 2 solar.r 181.30 190.17 216.484 171.857 167.43
## 3 wind 11.62 10.27 8.942 8.794 10.18
## 4 temp 65.55 79.10 83.903 83.968 76.90
# Same layout, restricted to month 5.
dcast(aqm, variable ~ month, mean, subset = .(month == 5))
## variable 5
## 1 ozone 23.62
## 2 solar.r 181.30
## 3 wind 11.62
## 4 temp 65.55
1.3 比较:reshape2、aggregate 和 plyr
使用airquality数据,比较reshape2、aggregate 和plyr的用法
# aggregate() works directly on the original (un-melted) data frame.
# na.rm = TRUE is forwarded to mean(); it is spelled out because T is an
# ordinary variable that can be reassigned.
aggregate(airquality, list(month = airquality$month), mean, na.rm = TRUE)
## month ozone solar.r wind temp month day
## 1 5 23.62 181.3 11.623 65.55 5 16.0
## 2 6 29.44 190.2 10.267 79.10 6 15.5
## 3 7 59.12 216.5 8.942 83.90 7 16.0
## 4 8 59.96 171.9 8.794 83.97 8 16.0
## 5 9 31.45 167.4 10.180 76.90 9 15.5
# ddply() + colwise() gives the same per-month means and is more flexible.
# na.rm = TRUE is spelled out because T is an ordinary, reassignable variable.
ddply(airquality, .(month), colwise(mean, na.rm = TRUE))
## month ozone solar.r wind temp day
## 1 5 23.62 181.3 11.623 65.55 16.0
## 2 6 29.44 190.2 10.267 79.10 15.5
## 3 7 59.12 216.5 8.942 83.90 16.0
## 4 8 59.96 171.9 8.794 83.97 16.0
## 5 9 31.45 167.4 10.180 76.90 15.5
# reshape2: no na.rm = TRUE is passed to mean() here. The non-NA results below
# suggest aqm was melted with na.rm = TRUE (the melt call is not shown in this
# section) -- TODO confirm.
dcast(aqm, month ~ variable, mean)
## month ozone solar.r wind temp
## 1 5 23.62 181.3 11.623 65.55
## 2 6 29.44 190.2 10.267 79.10
## 3 7 59.12 216.5 8.942 83.90
## 4 8 59.96 171.9 8.794 83.97
## 5 9 31.45 167.4 10.180 76.90
# Classic ddply() on the melted data. The result is long ("1D": one row per
# month/variable pair) instead of the wide ("2D") table that aggregate() and
# dcast() return.
# na.rm = TRUE is spelled out because T is an ordinary, reassignable variable.
df <- ddply(
  aqm, .(month, variable), summarize,
  mean = mean(value, na.rm = TRUE)
)
head(df, 3)
tail(df, 3)
## month variable mean
## 1 5 ozone 23.62
## 2 5 solar.r 181.30
## 3 5 wind 11.62
## month variable mean
## 18 9 solar.r 167.43
## 19 9 wind 10.18
## 20 9 temp 76.90
# Base reshape() turns the long ("1D") summary into a wide ("2D") table:
# one row per month, one mean.* column per variable.
reshape(
  df,
  idvar = "month",
  timevar = "variable",
  direction = "wide"
)
## month mean.ozone mean.solar.r mean.wind mean.temp
## 1 5 23.62 181.3 11.623 65.55
## 5 6 29.44 190.2 10.267 79.10
## 9 7 59.12 216.5 8.942 83.90
## 13 8 59.96 171.9 8.794 83.97
## 17 9 31.45 167.4 10.180 76.90
# ddply() on melted data looks clumsy, but it can apply several summary
# functions in one pass: here the mean, standard deviation and coefficient
# of variation (in percent) for every month/variable pair.
# na.rm = TRUE matches the other ddply() summary of aqm and keeps the
# statistics defined even if aqm still contains missing values.
df2 <- ddply(
  aqm, .(month, variable), summarize,
  mean = mean(value, na.rm = TRUE),
  sd = sd(value, na.rm = TRUE),
  CV = sd(value, na.rm = TRUE) / mean(value, na.rm = TRUE) * 100
)
head(df2, 3)
## month variable mean sd CV
## 1 5 ozone 23.62 22.224 94.11
## 2 5 solar.r 181.30 115.075 63.47
## 3 5 wind 11.62 3.531 30.38
1.4 使用alpha来凸显;数学公式:annotate(label = "plotmath表达式", parse = TRUE)(表达式用R的plotmath语法,写法类似LaTeX)
# m04: scatter plot with two text labels; alpha = 0.5 makes "Group 1"
# semi-transparent.
p <- ggplot(faithful, aes(x = eruptions, y = waiting)) +
  geom_point()
m04 <- p +
  annotate("text", x = 3, y = 48, label = "Group 1", alpha = 0.5) +
  annotate("text", x = 4.5, y = 66, label = "Group 2")

# m05: normal density curve with a formula label; parse = TRUE makes the
# label string be interpreted as an R plotmath expression.
p <- ggplot(data.frame(x = c(-3, 3)), aes(x = x)) +
  stat_function(fun = dnorm)
m05 <- p +
  annotate(
    "text",
    x = 0, y = 0.05, parse = TRUE, size = 4,
    label = "'Function: ' * y==frac(1, sqrt(2*pi)) * e^{-x^2/2}"
  )

multiplot(m04, m05)
1.5 图中图
• 图中图: ggplotGrob
• 图中画: rasterGrob
• 图中表: tableGrob
library(jpeg)

# Base scatter plot that the three insets are drawn on.
p <- ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point()

# Inset plot: the same scatter plot without axis titles.
fGrob <- ggplotGrob(p + theme(axis.title = element_blank()))

# Inset table. tableGrob() in gridExtra >= 2.0 has no `size` argument (the
# call fails with "unused argument"), so the font size is set via the theme.
tGrob <- tableGrob(head(iris[, 1:3]), theme = ttheme_default(base_size = 4))

# Inset image: the R logo file shipped with the jpeg package.
img <- readJPEG(system.file("img", "Rlogo.jpg", package = "jpeg"))
iGrob <- rasterGrob(img)

# xmin/xmax/ymin/ymax place each grob in data coordinates of the base plot.
p +
  annotation_custom(
    grob = fGrob, xmin = 1.5, xmax = 3.0, ymin = 10, ymax = 18
  ) +
  annotation_custom(
    grob = iGrob, xmin = 4.3, xmax = 5, ymin = 10, ymax = 15
  ) +
  annotation_custom(
    grob = tGrob, xmin = 3.5, xmax = 5.5, ymin = 27, ymax = 33
  )
1.6 Facet
• geom_text好使,能够label individual的facet
• annotate只能把每个facet都label成一样的标签
• 把加进每一个facet中的labels做成dataframe
# One label per facet: the label data frame carries the faceting variable
# (drv), so geom_text() draws each row only in the matching panel.
p <- ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(. ~ drv)
f_labels <- data.frame(
  drv = c("4", "f", "r"),
  label = c("4wd", "Front", "Rear")
)
p + geom_text(aes(label = label), data = f_labels, x = 6, y = 40)
# Build plotmath label strings describing a linear fit of hwy on displ.
#
# Args:
#   dat: data frame with numeric columns `displ` and `hwy`.
#
# Returns:
#   A one-row data frame with two character columns, both intended to be
#   drawn with parse = TRUE:
#     formula - the fitted line, e.g. "italic(y) == 0.50 +1.60 * italic(x)"
#     r2      - the coefficient of determination, e.g. "italic(R^2) == 0.98"
lm_labels <- function(dat) {
  mod <- lm(hwy ~ displ, data = dat)
  coefs <- unname(coef(mod))

  # %.2f already rounds to two decimals; %+.2f forces a sign so the slope
  # term reads "+b" or "-b".
  formula <- sprintf(
    "italic(y) == %.2f %+.2f * italic(x)",
    coefs[1], coefs[2]
  )

  # Take R^2 from the fitted model instead of cor(): lm() drops rows with
  # missing values, whereas cor() would return NA for the whole group.
  r2 <- sprintf("italic(R^2) == %.2f", summary(mod)$r.squared)

  data.frame(formula = formula, r2 = r2, stringsAsFactors = FALSE)
}
# Build one row of label strings per drv group, then draw the fitted line
# and both labels in every facet (parse = TRUE renders them as plotmath).
labels <- ddply(mpg, "drv", lm_labels)
p +
  geom_smooth(method = lm, se = FALSE) +
  geom_text(
    aes(label = formula), data = labels,
    x = 3, y = 40, parse = TRUE, hjust = 0, size = 2
  ) +
  geom_text(
    aes(label = r2), data = labels,
    x = 3, y = 35, parse = TRUE, hjust = 1, size = 2
  )
1.7 Segment and Line
# Line plot of Anomaly10y against Year for the rows whose Source is
# "Berkeley".
p <- ggplot(
  subset(climate, Source == "Berkeley"),
  aes(x = Year, y = Anomaly10y)
) +
  geom_line()

# m06: a blue arrow plus a double-ended segment with flat (90 degree) heads.
m06 <- p +
  annotate(
    "segment",
    x = 1850, xend = 1820, y = -0.8, yend = -0.95,
    color = "blue", size = 0.5, arrow = arrow()
  ) +
  annotate(
    "segment",
    x = 1950, xend = 1980, y = -0.25, yend = -0.25,
    arrow = arrow(ends = "both", angle = 90, length = unit(0.2, "cm"))
  )

# m07: a translucent blue rectangle spanning x = 1950 to 1980.
m07 <- p +
  annotate(
    "rect",
    xmin = 1950, xmax = 1980, ymin = -1, ymax = 1,
    alpha = 0.1, fill = "blue"
  )

multiplot(m06, m07)
# Preview the first rows of heightweight.
head(heightweight)
## sex ageYear ageMonth heightIn weightLb
## 1 f 11.92 143 56.3 85.0
## 2 f 12.92 155 62.3 105.0
## 3 f 12.75 153 63.3 108.0
## 4 f 13.42 161 59.0 92.0
## 5 f 15.92 191 62.5 112.5
## 6 f 14.25 171 62.5 112.0
# Integer code points of the letters "f" (102) and "m" (109).
utf <- vapply(c("f", "m"), utf8ToInt, integer(1), USE.NAMES = FALSE)
# Draw each point as a letter: the manual shape scale assigns the character
# codes stored in utf to the levels of sex.
p <- ggplot(
  heightweight,
  aes(x = heightIn, y = weightLb, shape = sex, color = sex)
)
p +
  geom_point(size = 3) +
  theme_bw() +
  scale_shape_manual(values = utf)

# For comparison: geom_text() also draws the letters, but the legend keys do
# not match the symbols shown in the figure.
p <- ggplot(
  heightweight,
  aes(x = heightIn, y = weightLb, shape = sex, label = sex, color = sex)
)
p + geom_text(size = 3)
# Logistic fit of the proportion Menarche / Total against Age.
# In ggplot2 >= 2.0.0 extra model arguments must be passed via method.args;
# a bare family = "binomial" is ignored with a warning and a Gaussian glm is
# fitted instead. se = FALSE is spelled out because F can be reassigned.
ggplot(menarche, aes(x = Age, y = Menarche / Total)) +
  stat_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE
  ) +
  geom_point()