ggplot2-第五章散点图（二）

# Multiple plot function ggplot objects can be passed in ..., or to plotlist
# (as a list of ggplot objects) - cols: Number of columns in layout -
# layout: A matrix specifying the layout. If present, 'cols' is ignored.  If
# the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), then
# plot 1 will go in the upper left, 2 will go in the upper right, and 3 will
# go all the way across the bottom.
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
    library(grid)
    
    # Make a list from the ... arguments and plotlist
    plots <- c(list(...), plotlist)
    
    numPlots = length(plots)
    
    # If layout is NULL, then use 'cols' to determine layout
    if (is.null(layout)) {
        # Make the panel ncol: Number of columns of plots nrow: Number of rows
        # needed, calculated from # of cols
        layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), ncol = cols, 
            nrow = ceiling(numPlots/cols))
    }
    
    if (numPlots == 1) {
        print(plots[[1]])
        
    } else {
        # Set up the page
        grid.newpage()
        pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
        
        # Make each plot, in the correct location
        for (i in 1:numPlots) {
            # Get the i,j matrix positions of the regions that contain this subplot
            matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
            
            print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, layout.pos.col = matchidx$col))
        }
    }
}
# multiplot(p1, p2, p3, p4, cols=2) gridExtra

# install.packages('gcookbook')
library(ggplot2)
library(gcookbook)
library(plyr)

sp <- ggplot(heightweight, aes(x = ageYear, y = heightIn))
sp + geom_point() + stat_smooth(method = lm)

sp + geom_point() + stat_smooth(method = lm, se = FALSE)

sp + geom_point() + stat_smooth(method = lm, colour = "red", size = 1, level = 0.99)

sp + geom_point() + stat_smooth(method = lm, colour = "red", size = 1, level = 0.99, 
    fill = "blue", alpha = 0.5)

sp + geom_point() + stat_smooth(method = loess)

拟合逻辑回归

MASS包中的数据集biopsy，包含9个与乳腺癌活检组织相关的指标以及肿瘤的分类，包括良性（benign）和恶性（malignant）

数据点重叠严重，需要向数据点添加一些扰动，将数据点设置为半透明、点形设置为空心圆，并使用略小的额数据点

library(MASS)  # for dataset biopsy
b <- biopsy
b$classn[b$class == "benign"] <- 0
b$classn[b$class == "malignant"] <- 1
head(b)

##        ID V1 V2 V3 V4 V5 V6 V7 V8 V9     class classn
## 1 1000025  5  1  1  1  2  1  3  1  1    benign      0
## 2 1002945  5  4  4  5  7 10  3  2  1    benign      0
## 3 1015425  3  1  1  1  2  2  3  1  1    benign      0
## 4 1016277  6  8  8  1  3  4  3  7  1    benign      0
## 5 1017023  4  1  1  3  2  1  3  1  1    benign      0
## 6 1017122  8 10 10  8  7 10  9  7  1 malignant      1

ggplot(b, aes(x = V1, y = classn)) + geom_point(position = position_jitter(width = 0.3, 
    height = 0.06), alpha = 0.4, shape = 21, size = 1.5) + stat_smooth(method = glm, 
    method.args = list(family = binomial))

5.7 添加已存在模型的拟合线

library(gcookbook)
model <- lm(heightIn ~ ageYear + I(ageYear^2), heightweight)
model

## 
## Call:
## lm(formula = heightIn ~ ageYear + I(ageYear^2), data = heightweight)
## 
## Coefficients:
##  (Intercept)       ageYear  I(ageYear^2)  
##     -10.3136        8.6673       -0.2478

根据模型计算预测值：

xmin <- min(heightweight$ageYear)
xmax <- max(heightweight$ageYear)
predicted <- data.frame(ageYear = seq(xmin, xmax, length.out = 100))
predicted$heightIn <- predict(model, predicted)
head(predicted)

##    ageYear heightIn
## 1 11.58000 56.82624
## 2 11.63980 57.00047
## 3 11.69960 57.17294
## 4 11.75939 57.34363
## 5 11.81919 57.51255
## 6 11.87899 57.67969

sp <- ggplot(heightweight, aes(x = ageYear, y = heightIn)) + geom_point(colour = "grey60")
sp + geom_line(data = predicted, size = 1)

predictvals <- function(model, xvar, yvar, xrange = NULL, samples = 100, ...) {
    if (is.null(xrange)) {
        if (any(class(model) %in% c("lm", "glm"))) 
            xrange <- range(model$model[[xvar]]) else if (any(class(model) %in% "loess")) 
            xrange <- range(model$x)
    }
    newdata <- data.frame(x = seq(xrange[1], xrange[2], length.out = samples))
    names(newdata) <- xvar
    newdata[[yvar]] <- predict(model, newdata = newdata, ...)
    newdata
}

modlinear <- lm(heightIn ~ ageYear, heightweight)
modloess <- loess(heightIn ~ ageYear, heightweight)
lm_predicted <- predictvals(modlinear, "ageYear", "heightIn")
loess_predicted <- predictvals(modloess, "ageYear", "heightIn")
sp + geom_line(data = lm_predicted, colour = "red", size = 0.8) + geom_line(data = loess_predicted, 
    colour = "blue", size = 0.8)

5.8 添加已存在的多个模型的拟合线

make_model <- function(data) {
    lm(heightIn ~ ageYear, data)
}
library(plyr)
models <- dlply(heightweight, "sex", .fun = make_model)
models

## $f
## 
## Call:
## lm(formula = heightIn ~ ageYear, data = data)
## 
## Coefficients:
## (Intercept)      ageYear  
##      43.963        1.209  
## 
## 
## $m
## 
## Call:
## lm(formula = heightIn ~ ageYear, data = data)
## 
## Coefficients:
## (Intercept)      ageYear  
##      30.658        2.301  
## 
## 
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
##   sex
## 1   f
## 2   m

predvals <- ldply(models, .fun = predictvals, xvar = "ageYear", "heightIn")
head(predvals)

##   sex  ageYear heightIn
## 1   f 11.58000 57.96250
## 2   f 11.63980 58.03478
## 3   f 11.69960 58.10707
## 4   f 11.75939 58.17936
## 5   f 11.81919 58.25165
## 6   f 11.87899 58.32394

tail(predvals)

##     sex  ageYear heightIn
## 195   m 16.88768 69.51539
## 196   m 16.94414 69.64531
## 197   m 17.00061 69.77523
## 198   m 17.05707 69.90515
## 199   m 17.11354 70.03507
## 200   m 17.17000 70.16499

ggplot(heightweight, aes(x = ageYear, y = heightIn, colour = sex)) + geom_point() + 
    geom_line(data = predvals)

ggplot(heightweight, aes(x = ageYear, y = heightIn, colour = sex)) + geom_point() + 
    geom_line(data = predvals) + facet_grid(. ~ sex)

5.9 向散点图添加模型参数

model <- lm(heightIn ~ ageYear, heightweight)
summary(model)

## 
## Call:
## lm(formula = heightIn ~ ageYear, data = heightweight)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.3517 -1.9006  0.1378  1.9071  8.3371 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.4356     1.8281   20.48   <2e-16 ***
## ageYear       1.7483     0.1329   13.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.989 on 234 degrees of freedom
## Multiple R-squared:  0.4249, Adjusted R-squared:  0.4225 
## F-statistic: 172.9 on 1 and 234 DF,  p-value: < 2.2e-16

pred <- predictvals(model, "ageYear", "heightIn")
sp <- ggplot(heightweight, aes(x = ageYear, y = heightIn)) + geom_point() + 
    geom_line(data = pred)
p1 <- sp + annotate("text", label = "r^2=0.42", x = 16.5, y = 52)
p2 <- sp + annotate("text", label = "r^2==0.42", parse = TRUE, x = 16.5, y = 52)
multiplot(p1, p2, cols = 2)

自动提取模型参数并创建引用这些值的公式

as.expression(italic(y) == a + b * italic(x) * "," ~ ~italic(r)^2 ~ "=" ~ r2)

## expression(italic(y) == a + b * italic(x) * "," ~ ~italic(r)^2 ~ 
##     "=" ~ r2)

eqn <- as.character(as.expression(substitute(italic(y) == a + b * italic(x) * 
    "," ~ ~italic(r)^2 ~ "=" ~ r2, list(a = format(coef(model)[1], digits = 3), 
    b = format(coef(model)[2], digits = 3), r2 = format(summary(model)$r.squared, 
        digits = 2)))))
eqn

## [1] "italic(y) == \"37.4\" + \"1.75\" * italic(x) * \",\" ~ ~italic(r)^2 ~ \"=\" ~ \"0.42\""

sp + annotate("text", label = eqn, parse = TRUE, x = Inf, y = -Inf, hjust = 1.1, 
    vjust = -0.5)

5.10 向散点图添加边际地毯

边际地毯可用于展示每个坐标轴上数据的分布情况，调用函数geom_rug()即可，也可调用position=“jitter”参数添加扰动，以减轻边际地毯的重叠程度。

p1 <- ggplot(faithful, aes(x = eruptions, y = waiting)) + geom_point()
p2 <- ggplot(faithful, aes(x = eruptions, y = waiting)) + geom_point() + geom_rug()
p3 <- ggplot(faithful, aes(x = eruptions, y = waiting)) + geom_point() + geom_rug(position = "jitter", 
    size = 0.2)
multiplot(p1, p2, p3, cols = 2)

5.11 向散点图添加标签

sub_countries <- subset(countries, Year == 2009 & healthexp > 2000)
head(sub_countries)

##           Name Code Year      GDP laborrate healthexp infmortality
## 254    Andorra  AND 2009       NA        NA  3089.636          3.1
## 560  Australia  AUS 2009 42130.82      65.2  3867.429          4.2
## 611    Austria  AUT 2009 45555.43      60.4  5037.311          3.6
## 968    Belgium  BEL 2009 43640.20      53.5  5104.019          3.6
## 1733    Canada  CAN 2009 39599.04      67.8  4379.761          5.2
## 2702   Denmark  DNK 2009 55933.35      65.4  6272.729          3.4

sp <- ggplot(sub_countries, aes(x = healthexp, y = infmortality)) + geom_point()
sp + annotate("text", x = 4350, y = 5.4, label = "Cnada") + annotate("text", 
    x = 7400, y = 6.5, label = "USA")

sp + geom_text(aes(label = Name), size = 4)

选择性的添加一部分标签，可在geom_text()中使用数据子集

idx <- sub_countries$Name %in% c("Canada", "Ireland", "United Kingdom", "United States", 
    "New Zealand", "Iceland", "Japan", "China")
cdat <- sub_countries[idx, ]
sp + geom_text(data = cdat, aes(x = healthexp, y = infmortality - 0.15, label = Name))

5.12 绘制气泡图

调用geom_point()和scale_size_area()函数即可

p <- ggplot(cdat, aes(x = healthexp, y = infmortality, size = GDP)) + geom_point(shape = 21, 
    colour = "black", fill = "cornsilk") + geom_text(aes(label = Name), size = 2)
p1 <- p + scale_size_area(max_size = 15)
multiplot(p, p1, cols = 2)

ggplot2-第五章散点图（二）

haoyi.li

2016/12/18

5.7 添加已存在模型的拟合线

5.8 添加已存在的多个模型的拟合线

5.9 向散点图添加模型参数

5.10 向散点图添加边际地毯

5.11 向散点图添加标签

5.12 绘制气泡图

ggplot2-第五章 散点图（二）

haoyi.li

2016/12/18

5.7 添加已存在模型的拟合线

5.8 添加已存在的多个模型的拟合线

5.9 向散点图添加模型参数

5.10 向散点图添加边际地毯

5.11 向散点图添加标签

5.12 绘制气泡图

ggplot2-第五章散点图（二）