Untitled

## Loading required package: grid

1 reshape

1.1 熔化

melt 的用法是 melt(data, id = c("cut", "color", "clarity"))。下面使用 R 自带的 airquality 数据（reshape2 文档中的示例数据，这里列名已转为小写）来演示。其结果如下

• 前n列是id (具体列数，依据id里有几项，如“cut”, “color”, “clarity” 三项)

• 接下来一列是variable

• 最后是value

##   ozone solar.r wind temp month day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3

##   month day variable value
## 1     5   1    ozone    41
## 2     5   2    ozone    36
## 3     5   3    ozone    12

##     month day variable value
## 610     9  28     temp    75
## 611     9  29     temp    76
## 612     9  30     temp    68

1.2 锻造

cast时，reshape2中有acast和dcast。acast可以cast 多维，当然也包括2维。但是与dcast相比，acast在cast二维dataframe时，少了第一列（也就是行名）。

• id ~ variable：一个或是多个id vs variable。多个id可以写成 id1 + id2 ~ variable，仍可用dcast；若公式含三个及以上维度（如 id1 ~ id2 ~ variable），要用acast，因为结果是多维数组

# Cast the melted data to one row per month, averaging each variable.
dcast(data = aqm, formula = month ~ variable, fun.aggregate = mean)

##   month ozone solar.r   wind  temp
## 1     5 23.62   181.3 11.623 65.55
## 2     6 29.44   190.2 10.267 79.10
## 3     7 59.12   216.5  8.942 83.90
## 4     8 59.96   171.9  8.794 83.97
## 5     9 31.45   167.4 10.180 76.90

# Same monthly means, with "(all)" margins added for rows and columns.
dcast(data = aqm, formula = month ~ variable, fun.aggregate = mean,
      margins = TRUE)

##   month ozone solar.r   wind  temp (all)
## 1     5 23.62   181.3 11.623 65.55 68.71
## 2     6 29.44   190.2 10.267 79.10 87.38
## 3     7 59.12   216.5  8.942 83.90 93.50
## 4     8 59.96   171.9  8.794 83.97 79.71
## 5     9 31.45   167.4 10.180 76.90 71.83
## 6 (all) 42.13   185.9  9.958 77.88 80.06

# The .() quoting helper used in `subset` comes from the plyr package,
# so plyr must be loaded before this call.
dcast(data = aqm, formula = month ~ variable, fun.aggregate = mean,
      subset = .(variable == "ozone"))

##   month ozone
## 1     5 23.62
## 2     6 29.44
## 3     7 59.12
## 4     8 59.96
## 5     9 31.45

# Swap rows and columns: variables become rows, months become columns.
dcast(data = aqm, formula = variable ~ month, fun.aggregate = mean)

##   variable      5      6       7       8      9
## 1    ozone  23.62  29.44  59.115  59.962  31.45
## 2  solar.r 181.30 190.17 216.484 171.857 167.43
## 3     wind  11.62  10.27   8.942   8.794  10.18
## 4     temp  65.55  79.10  83.903  83.968  76.90

# Restrict the cast to the rows for month 5.
dcast(data = aqm, formula = variable ~ month, fun.aggregate = mean,
      subset = .(month == 5))

##   variable      5
## 1    ozone  23.62
## 2  solar.r 181.30
## 3     wind  11.62
## 4     temp  65.55

1.3 比较 reshape2、aggregate 和 plyr

使用airquality数据，比较reshape2、aggregate和plyr的用法

# aggregate: works on the un-melted data frame directly.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change the result.
aggregate(airquality, list(month = airquality$month), mean, na.rm = TRUE)

##   month ozone solar.r   wind  temp month  day
## 1     5 23.62   181.3 11.623 65.55     5 16.0
## 2     6 29.44   190.2 10.267 79.10     6 15.5
## 3     7 59.12   216.5  8.942 83.90     7 16.0
## 4     8 59.96   171.9  8.794 83.97     8 16.0
## 5     9 31.45   167.4 10.180 76.90     9 15.5

# ddply + colwise gives the same per-month means and is more flexible.
# TRUE is spelled out because `T` can be reassigned.
ddply(airquality, .(month), colwise(mean, na.rm = TRUE))

##   month ozone solar.r   wind  temp  day
## 1     5 23.62   181.3 11.623 65.55 16.0
## 2     6 29.44   190.2 10.267 79.10 15.5
## 3     7 59.12   216.5  8.942 83.90 16.0
## 4     8 59.96   171.9  8.794 83.97 16.0
## 5     9 31.45   167.4 10.180 76.90 15.5

# reshape2: with the melted data there is no need to pass na.rm = TRUE here
# (presumably the NAs were dropped when melting; the melt call is not shown).
dcast(data = aqm, formula = month ~ variable, fun.aggregate = mean)

##   month ozone solar.r   wind  temp
## 1     5 23.62   181.3 11.623 65.55
## 2     6 29.44   190.2 10.267 79.10
## 3     7 59.12   216.5  8.942 83.90
## 4     8 59.96   171.9  8.794 83.97
## 5     9 31.45   167.4 10.180 76.90

# Plain ddply on the melted data: the result is long ("1D", one row per
# month/variable pair) rather than the wide ("2D") table produced by
# aggregate and dcast.
# TRUE is spelled out because `T` can be reassigned.
df <- ddply(aqm, .(month, variable), summarize,
            mean = mean(value, na.rm = TRUE))
head(df, 3)
tail(df, 3)

##   month variable   mean
## 1     5    ozone  23.62
## 2     5  solar.r 181.30
## 3     5     wind  11.62

##    month variable   mean
## 18     9  solar.r 167.43
## 19     9     wind  10.18
## 20     9     temp  76.90

# Use base reshape() to spread the long ("1D") result into a wide ("2D")
# table: one row per month, one mean.* column per variable.
reshape(data = df, direction = "wide", idvar = "month", timevar = "variable")

##    month mean.ozone mean.solar.r mean.wind mean.temp
## 1      5      23.62        181.3    11.623     65.55
## 5      6      29.44        190.2    10.267     79.10
## 9      7      59.12        216.5     8.942     83.90
## 13     8      59.96        171.9     8.794     83.97
## 17     9      31.45        167.4    10.180     76.90

# ddply on melted data looks clumsy, but it can compute several summary
# statistics per group in a single call: mean, standard deviation, and
# coefficient of variation (in percent).
df2 <- ddply(
    aqm, .(month, variable), summarize,
    mean = mean(value),
    sd = sd(value),
    CV = sd(value) / mean(value) * 100
)
head(df2, n = 3)

##   month variable   mean      sd    CV
## 1     5    ozone  23.62  22.224 94.11
## 2     5  solar.r 181.30 115.075 63.47
## 3     5     wind  11.62   3.531 30.38

1.4 使用alpha来凸显；数学公式用 annotate(label = "表达式", parse = TRUE)，表达式采用 R 的 plotmath 语法（类似但不同于 LaTeX）

# Scatter plot with two text labels; the first one is faded via alpha.
p <- ggplot(faithful, aes(x = eruptions, y = waiting)) +
    geom_point()
m04 <- p +
    annotate("text", x = 3, y = 48, label = "Group 1", alpha = 0.5) +
    annotate("text", x = 4.5, y = 66, label = "Group 2")

# Normal density curve annotated with a formula; parse = TRUE makes the
# label string be interpreted as a plotmath expression.
p <- ggplot(data.frame(x = c(-3, 3)), aes(x = x)) +
    stat_function(fun = dnorm)
m05 <- p +
    annotate("text", x = 0, y = 0.05, parse = TRUE, size = 4,
             label = "'Function: ' * y==frac(1, sqrt(2*pi)) * e^{-x^2/2}")

# Draw both plots together (multiplot is defined outside this snippet).
multiplot(m04, m05)

plot of chunk unnamed-chunk-5

1.5 图中图

• 图中图: ggplotGrob

• 图中画: rasterGrob

• 图中表: tableGrob

library(jpeg)
# Base scatter plot; it is also reused (without axis titles) as an inset.
p <- ggplot(mtcars, aes(x = wt, y = mpg)) +
    geom_point()

# Three grobs to embed: a raster image, a table, and a plot.
img <- readJPEG(system.file("img", "Rlogo.jpg", package = "jpeg"))
iGrob <- rasterGrob(img)
tGrob <- tableGrob(head(iris[, 1:3]), size = 4)
fGrob <- ggplotGrob(p + theme(axis.title = element_blank()))

# Place each grob in a rectangle given in data coordinates.
p +
    annotation_custom(grob = fGrob,
                      xmin = 1.5, xmax = 3.0, ymin = 10, ymax = 18) +
    annotation_custom(grob = iGrob,
                      xmin = 4.3, xmax = 5, ymin = 10, ymax = 15) +
    annotation_custom(grob = tGrob,
                      xmin = 3.5, xmax = 5.5, ymin = 27, ymax = 33)

plot of chunk unnamed-chunk-6

1.6 Facet

• geom_text好使，能够label individual的facet

• annotate只能把每个facet都label成一样的标签

• 把加进每一个facet中的labels做成dataframe

# One label per facet: supply the labels as a data frame keyed by the
# faceting variable (drv) and draw them with geom_text.
p <- ggplot(mpg, aes(x = displ, y = hwy)) +
    geom_point() +
    facet_grid(. ~ drv)
f_labels <- data.frame(
    drv = c("4", "f", "r"),
    label = c("4wd", "Front", "Rear")
)
p + geom_text(mapping = aes(label = label), data = f_labels, x = 6, y = 40)

plot of chunk unnamed-chunk-7

# Build plotmath label strings for a linear fit of hwy on displ.
#
# dat: data frame with numeric columns `displ` and `hwy`.
# Returns a one-row data frame with two character columns:
#   formula - the fitted line, coefficients shown to two decimals
#   r2      - squared correlation of displ and hwy, two decimals
lm_labels <- function(dat) {
    fit <- lm(hwy ~ displ, data = dat)
    coefs <- round(coef(fit), 2)
    # %+.2f always prints the sign, so the slope reads as "+b" or "-b".
    eq_label <- sprintf("italic(y) == %.2f %+.2f * italic(x)",
                        coefs[1], coefs[2])
    r2_label <- sprintf("italic(R^2) == %.2f", cor(dat$displ, dat$hwy)^2)
    data.frame(formula = eq_label, r2 = r2_label, stringsAsFactors = FALSE)
}

# Compute one formula/R^2 label pair per drv group, then draw them in each
# facet on top of the per-facet linear fit.
labels <- ddply(mpg, "drv", lm_labels)
# Fixed: the separator before `size` in the first geom_text call was a
# full-width comma (U+FF0C), which R cannot parse.
# NOTE(review): the r2 label uses hjust = 1 (right-aligned at x = 3) while
# the formula label uses hjust = 0 (left-aligned); confirm this is intended.
p + geom_smooth(method = lm, se = FALSE) +
    geom_text(x = 3, y = 40, aes(label = formula), data = labels,
              parse = TRUE, hjust = 0, size = 2) +
    geom_text(x = 3, y = 35, aes(label = r2), data = labels,
              parse = TRUE, hjust = 1, size = 2)

plot of chunk unnamed-chunk-7

1.7 Segment and Line

# Line plot of Anomaly10y against Year for the "Berkeley" source only.
p <- ggplot(subset(climate, Source == "Berkeley"),
            aes(x = Year, y = Anomaly10y)) +
    geom_line()

# m06: a blue arrow, plus a horizontal segment whose 90-degree arrow heads
# at both ends act as flat end caps.
m06 <- p +
    annotate("segment", x = 1850, xend = 1820, y = -0.8, yend = -0.95,
             color = "blue", size = 0.5, arrow = arrow()) +
    annotate("segment", x = 1950, xend = 1980, y = -0.25, yend = -0.25,
             arrow = arrow(ends = "both", angle = 90,
                           length = unit(0.2, "cm")))

# m07: a translucent blue rectangle shading 1950-1980.
m07 <- p +
    annotate("rect", xmin = 1950, xmax = 1980, ymin = -1, ymax = 1,
             alpha = 0.1, fill = "blue")

multiplot(m06, m07)

plot of chunk unnamed-chunk-8

# Preview the first six rows of the data set.
head(heightweight, n = 6L)

##   sex ageYear ageMonth heightIn weightLb
## 1   f   11.92      143     56.3     85.0
## 2   f   12.92      155     62.3    105.0
## 3   f   12.75      153     63.3    108.0
## 4   f   13.42      161     59.0     92.0
## 5   f   15.92      191     62.5    112.5
## 6   f   14.25      171     62.5    112.0

# Code points of the letters "f" (102) and "m" (109); integer shape values
# in this range make the points draw as those characters.
utf <- vapply(c("f", "m"), utf8ToInt, integer(1), USE.NAMES = FALSE)
p <- ggplot(heightweight,
            aes(x = heightIn, y = weightLb, shape = sex, color = sex))
p +
    geom_point(size = 3) +
    theme_bw() +
    scale_shape_manual(values = utf)

plot of chunk unnamed-chunk-9

# For comparison: drawing the letters with geom_text leaves a legend that
# does not match what is shown in the figure.
p <- ggplot(heightweight,
            aes(x = heightIn, y = weightLb, shape = sex, label = sex,
                color = sex))
p + geom_text(size = 3)

plot of chunk unnamed-chunk-9

# Logistic (binomial glm) smooth of the proportion Menarche/Total against Age.
# `family` is passed through method.args (requires ggplot2 >= 2.0.0): when
# given directly to stat_smooth, current ggplot2 ignores it with a warning
# and fits a gaussian glm instead.
# `se = FALSE` replaces the reassignable shorthand `F`.
ggplot(menarche, aes(x = Age, y = Menarche / Total)) +
    stat_smooth(method = "glm", method.args = list(family = "binomial"),
                se = FALSE) +
    geom_point()

plot of chunk unnamed-chunk-9

Untitled

Junyu Lee

Sunday, June 22, 2014