# Multiple plot function ggplot objects can be passed in ..., or to plotlist
# (as a list of ggplot objects) - cols: Number of columns in layout -
# layout: A matrix specifying the layout. If present, 'cols' is ignored. If
# the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), then
# plot 1 will go in the upper left, 2 will go in the upper right, and 3 will
# go all the way across the bottom.
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel ncol: Number of columns of plots nrow: Number of rows
# needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), ncol = cols,
nrow = ceiling(numPlots/cols))
}
if (numPlots == 1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, layout.pos.col = matchidx$col))
}
}
}
# multiplot(p1, p2, p3, p4, cols=2) gridExtra
# install.packages('gcookbook')
library(ggplot2)
library(gcookbook)
library(plyr)
sp <- ggplot(heightweight, aes(x = ageYear, y = heightIn))
sp + geom_point() + stat_smooth(method = lm)
sp + geom_point() + stat_smooth(method = lm, se = FALSE)
sp + geom_point() + stat_smooth(method = lm, colour = "red", size = 1, level = 0.99)
sp + geom_point() + stat_smooth(method = lm, colour = "red", size = 1, level = 0.99,
fill = "blue", alpha = 0.5)
sp + geom_point() + stat_smooth(method = loess)
拟合逻辑回归
MASS包中的数据集biopsy,包含9个与乳腺癌活检组织相关的指标以及肿瘤的分类,包括良性(benign)和恶性(malignant)
数据点重叠严重,需要向数据点添加一些扰动,将数据点设置为半透明、点形设置为空心圆,并使用略小的额数据点
library(MASS) # for dataset biopsy
b <- biopsy
b$classn[b$class == "benign"] <- 0
b$classn[b$class == "malignant"] <- 1
head(b)
## ID V1 V2 V3 V4 V5 V6 V7 V8 V9 class classn
## 1 1000025 5 1 1 1 2 1 3 1 1 benign 0
## 2 1002945 5 4 4 5 7 10 3 2 1 benign 0
## 3 1015425 3 1 1 1 2 2 3 1 1 benign 0
## 4 1016277 6 8 8 1 3 4 3 7 1 benign 0
## 5 1017023 4 1 1 3 2 1 3 1 1 benign 0
## 6 1017122 8 10 10 8 7 10 9 7 1 malignant 1
ggplot(b, aes(x = V1, y = classn)) + geom_point(position = position_jitter(width = 0.3,
height = 0.06), alpha = 0.4, shape = 21, size = 1.5) + stat_smooth(method = glm,
method.args = list(family = binomial))
library(gcookbook)
model <- lm(heightIn ~ ageYear + I(ageYear^2), heightweight)
model
##
## Call:
## lm(formula = heightIn ~ ageYear + I(ageYear^2), data = heightweight)
##
## Coefficients:
## (Intercept) ageYear I(ageYear^2)
## -10.3136 8.6673 -0.2478
根据模型计算预测值:
xmin <- min(heightweight$ageYear)
xmax <- max(heightweight$ageYear)
predicted <- data.frame(ageYear = seq(xmin, xmax, length.out = 100))
predicted$heightIn <- predict(model, predicted)
head(predicted)
## ageYear heightIn
## 1 11.58000 56.82624
## 2 11.63980 57.00047
## 3 11.69960 57.17294
## 4 11.75939 57.34363
## 5 11.81919 57.51255
## 6 11.87899 57.67969
sp <- ggplot(heightweight, aes(x = ageYear, y = heightIn)) + geom_point(colour = "grey60")
sp + geom_line(data = predicted, size = 1)
predictvals <- function(model, xvar, yvar, xrange = NULL, samples = 100, ...) {
if (is.null(xrange)) {
if (any(class(model) %in% c("lm", "glm")))
xrange <- range(model$model[[xvar]]) else if (any(class(model) %in% "loess"))
xrange <- range(model$x)
}
newdata <- data.frame(x = seq(xrange[1], xrange[2], length.out = samples))
names(newdata) <- xvar
newdata[[yvar]] <- predict(model, newdata = newdata, ...)
newdata
}
modlinear <- lm(heightIn ~ ageYear, heightweight)
modloess <- loess(heightIn ~ ageYear, heightweight)
lm_predicted <- predictvals(modlinear, "ageYear", "heightIn")
loess_predicted <- predictvals(modloess, "ageYear", "heightIn")
sp + geom_line(data = lm_predicted, colour = "red", size = 0.8) + geom_line(data = loess_predicted,
colour = "blue", size = 0.8)
make_model <- function(data) {
lm(heightIn ~ ageYear, data)
}
library(plyr)
models <- dlply(heightweight, "sex", .fun = make_model)
models
## $f
##
## Call:
## lm(formula = heightIn ~ ageYear, data = data)
##
## Coefficients:
## (Intercept) ageYear
## 43.963 1.209
##
##
## $m
##
## Call:
## lm(formula = heightIn ~ ageYear, data = data)
##
## Coefficients:
## (Intercept) ageYear
## 30.658 2.301
##
##
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
## sex
## 1 f
## 2 m
predvals <- ldply(models, .fun = predictvals, xvar = "ageYear", "heightIn")
head(predvals)
## sex ageYear heightIn
## 1 f 11.58000 57.96250
## 2 f 11.63980 58.03478
## 3 f 11.69960 58.10707
## 4 f 11.75939 58.17936
## 5 f 11.81919 58.25165
## 6 f 11.87899 58.32394
tail(predvals)
## sex ageYear heightIn
## 195 m 16.88768 69.51539
## 196 m 16.94414 69.64531
## 197 m 17.00061 69.77523
## 198 m 17.05707 69.90515
## 199 m 17.11354 70.03507
## 200 m 17.17000 70.16499
ggplot(heightweight, aes(x = ageYear, y = heightIn, colour = sex)) + geom_point() +
geom_line(data = predvals)
ggplot(heightweight, aes(x = ageYear, y = heightIn, colour = sex)) + geom_point() +
geom_line(data = predvals) + facet_grid(. ~ sex)
model <- lm(heightIn ~ ageYear, heightweight)
summary(model)
##
## Call:
## lm(formula = heightIn ~ ageYear, data = heightweight)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3517 -1.9006 0.1378 1.9071 8.3371
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.4356 1.8281 20.48 <2e-16 ***
## ageYear 1.7483 0.1329 13.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.989 on 234 degrees of freedom
## Multiple R-squared: 0.4249, Adjusted R-squared: 0.4225
## F-statistic: 172.9 on 1 and 234 DF, p-value: < 2.2e-16
pred <- predictvals(model, "ageYear", "heightIn")
sp <- ggplot(heightweight, aes(x = ageYear, y = heightIn)) + geom_point() +
geom_line(data = pred)
p1 <- sp + annotate("text", label = "r^2=0.42", x = 16.5, y = 52)
p2 <- sp + annotate("text", label = "r^2==0.42", parse = TRUE, x = 16.5, y = 52)
multiplot(p1, p2, cols = 2)
自动提取模型参数并创建引用这些值的公式
as.expression(italic(y) == a + b * italic(x) * "," ~ ~italic(r)^2 ~ "=" ~ r2)
## expression(italic(y) == a + b * italic(x) * "," ~ ~italic(r)^2 ~
## "=" ~ r2)
eqn <- as.character(as.expression(substitute(italic(y) == a + b * italic(x) *
"," ~ ~italic(r)^2 ~ "=" ~ r2, list(a = format(coef(model)[1], digits = 3),
b = format(coef(model)[2], digits = 3), r2 = format(summary(model)$r.squared,
digits = 2)))))
eqn
## [1] "italic(y) == \"37.4\" + \"1.75\" * italic(x) * \",\" ~ ~italic(r)^2 ~ \"=\" ~ \"0.42\""
sp + annotate("text", label = eqn, parse = TRUE, x = Inf, y = -Inf, hjust = 1.1,
vjust = -0.5)
边际地毯可用于展示每个坐标轴上数据的分布情况,调用函数geom_rug()即可,也可调用position=“jitter”参数添加扰动,以减轻边际地毯的重叠程度。
p1 <- ggplot(faithful, aes(x = eruptions, y = waiting)) + geom_point()
p2 <- ggplot(faithful, aes(x = eruptions, y = waiting)) + geom_point() + geom_rug()
p3 <- ggplot(faithful, aes(x = eruptions, y = waiting)) + geom_point() + geom_rug(position = "jitter",
size = 0.2)
multiplot(p1, p2, p3, cols = 2)
sub_countries <- subset(countries, Year == 2009 & healthexp > 2000)
head(sub_countries)
## Name Code Year GDP laborrate healthexp infmortality
## 254 Andorra AND 2009 NA NA 3089.636 3.1
## 560 Australia AUS 2009 42130.82 65.2 3867.429 4.2
## 611 Austria AUT 2009 45555.43 60.4 5037.311 3.6
## 968 Belgium BEL 2009 43640.20 53.5 5104.019 3.6
## 1733 Canada CAN 2009 39599.04 67.8 4379.761 5.2
## 2702 Denmark DNK 2009 55933.35 65.4 6272.729 3.4
sp <- ggplot(sub_countries, aes(x = healthexp, y = infmortality)) + geom_point()
sp + annotate("text", x = 4350, y = 5.4, label = "Cnada") + annotate("text",
x = 7400, y = 6.5, label = "USA")
sp + geom_text(aes(label = Name), size = 4)
选择性的添加一部分标签,可在geom_text()中使用数据子集
idx <- sub_countries$Name %in% c("Canada", "Ireland", "United Kingdom", "United States",
"New Zealand", "Iceland", "Japan", "China")
cdat <- sub_countries[idx, ]
sp + geom_text(data = cdat, aes(x = healthexp, y = infmortality - 0.15, label = Name))
调用geom_point()和scale_size_area()函数即可
p <- ggplot(cdat, aes(x = healthexp, y = infmortality, size = GDP)) + geom_point(shape = 21,
colour = "black", fill = "cornsilk") + geom_text(aes(label = Name), size = 2)
p1 <- p + scale_size_area(max_size = 15)
multiplot(p, p1, cols = 2)