目标:熟悉 RStudio 环境,学会使用帮助文档和基础命令。
587 * (23234 + 899) / 9
## [1] 1574008
使用 example()
函数可以直接运行包作者提供的示例代码,这是学习新函数最快的方式。
example(boxplot)
##
## boxplt> ## boxplot on a formula:
## boxplt> boxplot(count ~ spray, data = InsectSprays, col = "lightgray")
##
## boxplt> # *add* notches (somewhat funny here <--> warning "notches .. outside hinges"):
## boxplt> boxplot(count ~ spray, data = InsectSprays,
## boxplt+ notch = TRUE, add = TRUE, col = "blue")
##
## boxplt> boxplot(decrease ~ treatment, data = OrchardSprays, col = "bisque",
## boxplt+ log = "y")
##
## boxplt> ## horizontal=TRUE, switching y <--> x :
## boxplt> boxplot(decrease ~ treatment, data = OrchardSprays, col = "bisque",
## boxplt+ log = "x", horizontal=TRUE)
##
## boxplt> rb <- boxplot(decrease ~ treatment, data = OrchardSprays, col = "bisque")
##
## boxplt> title("Comparing boxplot()s and non-robust mean +/- SD")
##
## boxplt> mn.t <- tapply(OrchardSprays$decrease, OrchardSprays$treatment, mean)
##
## boxplt> sd.t <- tapply(OrchardSprays$decrease, OrchardSprays$treatment, sd)
##
## boxplt> xi <- 0.3 + seq(rb$n)
##
## boxplt> points(xi, mn.t, col = "orange", pch = 18)
##
## boxplt> arrows(xi, mn.t - sd.t, xi, mn.t + sd.t,
## boxplt+ code = 3, col = "pink", angle = 75, length = .1)
##
## boxplt> ## boxplot on a matrix:
## boxplt> mat <- cbind(Uni05 = (1:100)/21, Norm = rnorm(100),
## boxplt+ `5T` = rt(100, df = 5), Gam2 = rgamma(100, shape = 2))
##
## boxplt> boxplot(mat) # directly, calling boxplot.matrix()
##
## boxplt> ## boxplot on a data frame:
## boxplt> df. <- as.data.frame(mat)
##
## boxplt> par(las = 1) # all axis labels horizontal
##
## boxplt> boxplot(df., main = "boxplot(*, horizontal = TRUE)", horizontal = TRUE)
##
## boxplt> ## Using 'at = ' and adding boxplots -- example idea by Roger Bivand :
## boxplt> boxplot(len ~ dose, data = ToothGrowth,
## boxplt+ boxwex = 0.25, at = 1:3 - 0.2,
## boxplt+ subset = supp == "VC", col = "yellow",
## boxplt+ main = "Guinea Pigs' Tooth Growth",
## boxplt+ xlab = "Vitamin C dose mg",
## boxplt+ ylab = "tooth length",
## boxplt+ xlim = c(0.5, 3.5), ylim = c(0, 35), yaxs = "i")
##
## boxplt> boxplot(len ~ dose, data = ToothGrowth, add = TRUE,
## boxplt+ boxwex = 0.25, at = 1:3 + 0.2,
## boxplt+ subset = supp == "OJ", col = "orange")
##
## boxplt> legend(2, 9, c("Ascorbic acid", "Orange juice"),
## boxplt+ fill = c("yellow", "orange"))
##
## boxplt> ## With less effort (slightly different) using factor *interaction*:
## boxplt> boxplot(len ~ dose:supp, data = ToothGrowth,
## boxplt+ boxwex = 0.5, col = c("orange", "yellow"),
## boxplt+ main = "Guinea Pigs' Tooth Growth",
## boxplt+ xlab = "Vitamin C dose mg", ylab = "tooth length",
## boxplt+ sep = ":", lex.order = TRUE, ylim = c(0, 35), yaxs = "i")
##
## boxplt> ## more examples in help(bxp)
## boxplt>
## boxplt>
## boxplt>
example(heatmap)
##
## heatmp> require(graphics); require(grDevices)
##
## heatmp> x <- as.matrix(mtcars)
##
## heatmp> rc <- rainbow(nrow(x), start = 0, end = .3)
##
## heatmp> cc <- rainbow(ncol(x), start = 0, end = .3)
##
## heatmp> hv <- heatmap(x, col = cm.colors(256), scale = "column",
## heatmp+ RowSideColors = rc, ColSideColors = cc, margins = c(5,10),
## heatmp+ xlab = "specification variables", ylab = "Car Models",
## heatmp+ main = "heatmap(<Mtcars data>, ..., scale = \"column\")")
##
## heatmp> utils::str(hv) # the two re-ordering index vectors
## List of 4
## $ rowInd: int [1:32] 31 17 16 15 5 25 29 24 7 6 ...
## $ colInd: int [1:11] 2 9 8 11 6 5 10 7 1 4 ...
## $ Rowv : NULL
## $ Colv : NULL
##
## heatmp> ## no column dendrogram (nor reordering) at all:
## heatmp> heatmap(x, Colv = NA, col = cm.colors(256), scale = "column",
## heatmp+ RowSideColors = rc, margins = c(5,10),
## heatmp+ xlab = "specification variables", ylab = "Car Models",
## heatmp+ main = "heatmap(<Mtcars data>, ..., scale = \"column\")")
##
## heatmp> ## Don't show:
## heatmp> ## no row dendrogram (nor reordering) at all:
## heatmp> heatmap(x, Rowv = NA, col = cm.colors(256), scale = "column",
## heatmp+ ColSideColors = cc, margins = c(5,10),
## heatmp+ xlab = "xlab", ylab = "ylab") # no main
##
## heatmp> ## End(Don't show)
## heatmp> ## "no nothing"
## heatmp> heatmap(x, Rowv = NA, Colv = NA, scale = "column",
## heatmp+ main = "heatmap(*, NA, NA) ~= image(t(x))")
##
## heatmp> ## Demonstration of the 'scale' argument:
## heatmp> ## The only change in the code is the 'scale' arg.
## heatmp> ## The only visible change is in the color scale on the heatmap
## heatmp> ## (the original data are not scaled).
## heatmp>
## heatmp> heatmap(x, col = terrain.colors(128), scale = "column",
## heatmp+ RowSideColors = rc,
## heatmp+ ColSideColors = cc,
## heatmp+ margins = c(5,10),
## heatmp+ main = "heatmap(<Mtcars data>, ..., scale = \"column\")")
##
## heatmp> heatmap(x, col = terrain.colors(128), scale = "none",
## heatmp+ RowSideColors = rc,
## heatmp+ ColSideColors = cc,
## heatmp+ margins = c(5,10),
## heatmp+ main = "heatmap(<Mtcars data>, ..., scale = \"none\")")
##
## heatmp> round(Ca <- cor(attitude), 2)
## rating complaints privileges learning raises critical advance
## rating 1.00 0.83 0.43 0.62 0.59 0.16 0.16
## complaints 0.83 1.00 0.56 0.60 0.67 0.19 0.22
## privileges 0.43 0.56 1.00 0.49 0.45 0.15 0.34
## learning 0.62 0.60 0.49 1.00 0.64 0.12 0.53
## raises 0.59 0.67 0.45 0.64 1.00 0.38 0.57
## critical 0.16 0.19 0.15 0.12 0.38 1.00 0.28
## advance 0.16 0.22 0.34 0.53 0.57 0.28 1.00
##
## heatmp> symnum(Ca) # simple graphic
## rt cm p l rs cr a
## rating 1
## complaints + 1
## privileges . . 1
## learning , . . 1
## raises . , . , 1
## critical . 1
## advance . . . 1
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1
##
## heatmp> heatmap(Ca, symm = TRUE, margins = c(6,6)) # with reorder()
##
## heatmp> heatmap(Ca, Rowv = FALSE, symm = TRUE, margins = c(6,6)) # _NO_ reorder()
##
## heatmp> ## slightly artificial with color bar, without and with ordering:
## heatmp> cc <- rainbow(nrow(Ca))
##
## heatmp> heatmap(Ca, Rowv = FALSE, symm = TRUE, RowSideColors = cc, ColSideColors = cc,
## heatmp+ margins = c(6,6))
##
## heatmp> heatmap(Ca, symm = TRUE, RowSideColors = cc, ColSideColors = cc,
## heatmp+ margins = c(6,6))
##
## heatmp> ## For variable clustering, rather use distance based on cor():
## heatmp> symnum( cU <- cor(USJudgeRatings) )
## CO I DM DI CF DE PR F O W PH R
## CONT 1
## INTG 1
## DMNR B 1
## DILG + + 1
## CFMG + + B 1
## DECI + + B B 1
## PREP + + B B B 1
## FAMI + + B * * B 1
## ORAL * * B B * B B 1
## WRIT * + B * * B B B 1
## PHYS , , + + + + + + + 1
## RTEN * * * * * B * B B * 1
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1
##
## heatmp> hU <- heatmap(cU, Rowv = FALSE, symm = TRUE, col = topo.colors(16),
## heatmp+ distfun = function(c) as.dist(1 - c), keep.dendro = TRUE)
##
## heatmp> ## The Correlation matrix with same reordering:
## heatmp> round(100 * cU[hU[[1]], hU[[2]]])
## CONT INTG DMNR PHYS DILG CFMG DECI RTEN ORAL WRIT PREP FAMI
## CONT 100 -13 -15 5 1 14 9 -3 -1 -4 1 -3
## INTG -13 100 96 74 87 81 80 94 91 91 88 87
## DMNR -15 96 100 79 84 81 80 94 91 89 86 84
## PHYS 5 74 79 100 81 88 87 91 89 86 85 84
## DILG 1 87 84 81 100 96 96 93 95 96 98 96
## CFMG 14 81 81 88 96 100 98 93 95 94 96 94
## DECI 9 80 80 87 96 98 100 92 95 95 96 94
## RTEN -3 94 94 91 93 93 92 100 98 97 95 94
## ORAL -1 91 91 89 95 95 95 98 100 99 98 98
## WRIT -4 91 89 86 96 94 95 97 99 100 99 99
## PREP 1 88 86 85 98 96 96 95 98 99 100 99
## FAMI -3 87 84 84 96 94 94 94 98 99 99 100
##
## heatmp> ## The column dendrogram:
## heatmp> utils::str(hU$Colv)
## --[dendrogram w/ 2 branches and 12 members at h = 1.15]
## |--leaf "CONT"
## `--[dendrogram w/ 2 branches and 11 members at h = 0.258]
## |--[dendrogram w/ 2 branches and 2 members at h = 0.0354]
## | |--leaf "INTG"
## | `--leaf "DMNR"
## `--[dendrogram w/ 2 branches and 9 members at h = 0.187]
## |--leaf "PHYS"
## `--[dendrogram w/ 2 branches and 8 members at h = 0.075]
## |--[dendrogram w/ 2 branches and 3 members at h = 0.0438]
## | |--leaf "DILG"
## | `--[dendrogram w/ 2 branches and 2 members at h = 0.0189]
## | |--leaf "CFMG"
## | `--leaf "DECI"
## `--[dendrogram w/ 2 branches and 5 members at h = 0.0584]
## |--leaf "RTEN"
## `--[dendrogram w/ 2 branches and 4 members at h = 0.0187]
## |--[dendrogram w/ 2 branches and 2 members at h = 0.00657]
## | |--leaf "ORAL"
## | `--leaf "WRIT"
## `--[dendrogram w/ 2 branches and 2 members at h = 0.0101]
## |--leaf "PREP"
## `--leaf "FAMI"
学会查文档是编程的核心能力。
# 使用 ?函数名 查看文档
# ?t.test
# 运行一个 t 检验示例
t.test(rnorm(n=10, mean=10, sd=1), rnorm(n=10, mean=0, sd=1))
##
## Welch Two Sample t-test
##
## data: rnorm(n = 10, mean = 10, sd = 1) and rnorm(n = 10, mean = 0, sd = 1)
## t = 27.62, df = 17.069, p-value = 1.312e-15
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 8.944441 10.423450
## sample estimates:
## mean of x mean of y
## 9.74743092 0.06348571
# help.start() # 运行此命令可在浏览器中打开帮助主页
目标:加载包,理解 Tidy Data 格式,进行基础的数据清洗。
packages_to_install <- c(
# 1. 核心绘图与数据处理
"tidyverse", # 包含 ggplot2, dplyr, tidyr, readr 等
"gapminder", # 教学数据集
"palmerpenguins", # 教学数据集
# 2. 辅助工具
"scales", # 调整坐标轴标签 (如 $1M)
"ggrepel", # 解决文字重叠
"cowplot", # 多图拼贴 (SCI发表级排版)
"gridExtra", # 另一种多图排列工具
# 3. 配色方案
"viridis", # 色盲友好配色
"ggsci", # 顶刊配色 (JAMA, Lancet等)
"RColorBrewer", # 经典调色板
# 4. 交互式绘图
"plotly" # 动态交互
)
# 检查哪些包尚未安装
new_packages <- packages_to_install[!(packages_to_install %in% installed.packages()[,"Package"])]
# 如果有未安装的包,则进行安装
if(length(new_packages) > 0) {
# dependencies = TRUE 是默认值,但显式指定更安全。
# 它会自动安装这些包所需的所有其他依赖包。
install.packages(new_packages, dependencies = TRUE)
cat("\n--- 所有缺失的包已安装完成 ---\n")
} else {
cat("\n--- 所有包均已安装,无需操作 ---\n")
}
##
## --- 所有包均已安装,无需操作 ---
# 1. 核心数据处理与绘图
library(tidyverse) # ggplot2, dplyr, tidyr
library(gapminder) # 教学数据集: 全球发展数据
library(palmerpenguins) # 教学数据集: 企鹅数据
# 2. 辅助工具
library(scales) # 调整坐标轴标签 (如 $1M)
library(ggrepel) # 解决文字重叠
library(cowplot) # 多图拼贴 (SCI发表级排版)
library(gridExtra) # 另一种多图排列工具
# 3. 配色方案
library(viridis) # 色盲友好配色
library(ggsci) # 顶刊配色 (JAMA, Lancet等)
library(RColorBrewer) # 经典调色板
# 4. 交互式绘图
library(plotly) # 动态交互
使用 gapminder
数据集,它记录了全球各国半个多世纪的发展轨迹。
data("gapminder")
# 查看数据结构
glimpse(gapminder)
## Rows: 1,704
## Columns: 6
## $ country <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
# 数据摘要
summary(gapminder)
## country continent year lifeExp
## Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60
## Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20
## Algeria : 12 Asia :396 Median :1980 Median :60.71
## Angola : 12 Europe :360 Mean :1980 Mean :59.47
## Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85
## Australia : 12 Max. :2007 Max. :82.60
## (Other) :1632
## pop gdpPercap
## Min. :6.001e+04 Min. : 241.2
## 1st Qu.:2.794e+06 1st Qu.: 1202.1
## Median :7.024e+06 Median : 3531.8
## Mean :2.960e+07 Mean : 7215.3
## 3rd Qu.:1.959e+07 3rd Qu.: 9325.5
## Max. :1.319e+09 Max. :113523.1
##
为了绘图方便,我们筛选 2007 年的数据,并计算总 GDP。
clean_data_2007 <- gapminder %>%
filter(year == 2007) %>%
mutate(
pop_m = pop / 1e6, # 人口转换为百万
gdp_total = gdpPercap * pop, # 计算总GDP
is_china = ifelse(country == "China", "China", "Other") # 标记中国
)
head(clean_data_2007)
使用 group_by 和 summarize
计算各大洲的汇总数据。
continent_summary <- gapminder %>%
filter(year == 2007) %>%
group_by(continent) %>%
summarize(
avg_lifeExp = mean(lifeExp), # 平均预期寿命
total_pop = sum(pop), # 总人口
median_gdp = median(gdpPercap), # 中位数GDP
n_countries = n() # 国家数量
) %>%
arrange(desc(avg_lifeExp))
knitr::kable(continent_summary, caption = "2007年各大洲汇总数据")
| continent | avg_lifeExp | total_pop | median_gdp | n_countries |
|---|---|---|---|---|
| Oceania | 80.71950 | 24549947 | 29810.188 | 2 |
| Europe | 77.64860 | 586098529 | 28054.066 | 30 |
| Americas | 73.60812 | 898871184 | 8948.103 | 25 |
| Asia | 70.72848 | 3811953827 | 4471.062 | 33 |
| Africa | 54.80604 | 929539692 | 1452.267 | 52 |
目标:对比 Base R 与 ggplot2,理解 映射 (Mapping) 和 几何对象 (Geoms)。
场景:查看人均 GDP 与 预期寿命 的关系。
Base R 就像在一张纸上画图,需要手动控制每一个细节(如颜色映射、图例)。
# 使用 pch 参数将大洲映射为不同的点形状
plot(clean_data_2007$gdpPercap, clean_data_2007$lifeExp,
col = clean_data_2007$continent,
pch = as.numeric(clean_data_2007$continent),
main = "Base R Plot", xlab = "GDP", ylab = "Life Expectancy")
# 手动添加图例
legend("bottomright",
legend = levels(clean_data_2007$continent),
col = 1:length(levels(clean_data_2007$continent)),
pch = 1:length(levels(clean_data_2007$continent)),
cex = 0.7)
高级绘图命令产生一个图,低级绘图命令增加点,线,文本,图例等。
# 读取本地数据
mydata <- read.table("./dataset/Planning.csv", header = T)
head(mydata)
summary(mydata)
## id age relig ped
## Min. : 1.00 Min. :18.00 Min. :1.000 Min. :0.000
## 1st Qu.: 63.25 1st Qu.:24.00 1st Qu.:1.000 1st Qu.:2.000
## Median :125.50 Median :27.00 Median :1.000 Median :2.000
## Mean :125.64 Mean :27.43 Mean :1.108 Mean :3.283
## 3rd Qu.:187.75 3rd Qu.:30.00 3rd Qu.:1.000 3rd Qu.:5.000
## Max. :251.00 Max. :41.00 Max. :2.000 Max. :7.000
## NA's :1 NA's :24
## income am reason bps bpd
## Min. :1.000 Min. :15.00 Min. :1.000 Min. : 0 Min. : 0.00
## 1st Qu.:1.000 1st Qu.:18.00 1st Qu.:1.000 1st Qu.:110 1st Qu.: 60.00
## Median :2.000 Median :20.00 Median :1.000 Median :110 Median : 70.00
## Mean :2.129 Mean :20.35 Mean :1.484 Mean :113 Mean : 71.69
## 3rd Qu.:3.000 3rd Qu.:22.00 3rd Qu.:2.000 3rd Qu.:120 3rd Qu.: 80.00
## Max. :6.000 Max. :31.00 Max. :3.000 Max. :170 Max. :110.00
## NA's :26 NA's :1 NA's :2 NA's :7 NA's :7
## wt ht
## Min. : 0.00 Min. :141.0
## 1st Qu.:46.50 1st Qu.:150.0
## Median :51.40 Median :153.0
## Mean :51.89 Mean :153.5
## 3rd Qu.:57.00 3rd Qu.:156.0
## Max. :73.80 Max. :175.0
## NA's :5 NA's :7
# 绘制箱线图
boxplot(mydata$bps ~ mydata$income, col = "tomato", main = "Income vs BPS")
# 删除缺失值
mydata.omit <- na.omit(mydata)
dim(mydata.omit)
## [1] 210 11
# 绘制散点图
plot(
mydata.omit$ht,
mydata.omit$wt,
pch = 20,
col = "rosybrown",
xlab = "Height",
ylab = "Weight",
main = "Base R: Plot + Abline + Text"
)
# 增加线性回归线
lm_line <- lm(mydata.omit$wt ~ mydata.omit$ht)
abline(lm_line)
# 增加文本
text(158, 65, "Weight=0.53*Height-29.36")
# 增加图例
legend("topright", "A legend", col = "rosybrown", pch = 20)
mydata <- read.table("./dataset/svmvsrpart10times.csv",header=T)
print(mydata)
## diags1 diags2 kappas1 kappas2 sensitivity1 sensitivity2
## 1 0.8148148 0.7870370 0.5749705 0.5083135 0.6153846 0.5641026
## 2 0.8148148 0.8240741 0.5847751 0.6032483 0.6666667 0.6666667
## 3 0.7592593 0.7314815 0.4840132 0.4212860 0.6923077 0.6410256
## 4 0.7962963 0.6574074 0.5379230 0.2776573 0.6153846 0.5897436
## 5 0.6944444 0.6666667 0.3109049 0.3157339 0.4871795 0.6666667
## 6 0.7500000 0.7685185 0.4361949 0.4955157 0.5641026 0.6666667
## 7 0.7870370 0.6851852 0.5306122 0.3100338 0.6666667 0.5384615
## 8 0.7500000 0.7314815 0.4551570 0.4212860 0.6410256 0.6410256
## 9 0.7962963 0.7500000 0.5379230 0.4295775 0.6153846 0.5384615
## 10 0.7685185 0.7500000 0.4779582 0.4551570 0.5897436 0.6410256
## specificity1 specificity2
## 1 0.9275362 0.9130435
## 2 0.8985507 0.9130435
## 3 0.7971014 0.7826087
## 4 0.8985507 0.6956522
## 5 0.8115942 0.6666667
## 6 0.8550725 0.8260870
## 7 0.8550725 0.7681159
## 8 0.8115942 0.7826087
## 9 0.8985507 0.8695652
## 10 0.8695652 0.8115942
plot(mydata$diags1,pch=19,xlab="",ylab="",ylim=c(0,1),cex=0.8,col="blue",xaxt="n",main="Accuracy")
axis(side=1,at=c(1:10),line=NA)
lines(mydata$diags1,col="blue")
points(mydata$diags2,col="red",pch=17,cex=0.8)
lines(mydata$diags2,col="red",lty=3)
legend("bottomright",c("svm","rpart"),col=c("blue","red"),pch=c(18,17),lty=c(1,3),cex=0.8)
par(mfrow=c(2,2))
plot(mydata$diags1,pch=19,xlab="",ylab="",ylim=c(0,1),cex=0.8,col="blue",xaxt="n",main="Accuracy")
axis(side=1,at=c(1:10),line=NA)
lines(mydata$diags1,col="blue")
points(mydata$diags2,col="red",pch=17,cex=0.8)
lines(mydata$diags2,col="red",lty=3)
legend("bottomright",c("svm","rpart"),col=c("blue","red"),pch=c(18,17),lty=c(1,3),cex=0.8)
plot(mydata$kappas1,pch=19,xlab="",ylab="",ylim=c(0,1),cex=0.8,col="blue",xaxt="n",main="Kappa")
axis(side=1,at=c(1:10),line=NA)
lines(mydata$kappas1,col="blue")
points(mydata$kappas2,col="red",pch=17,cex=0.8)
lines(mydata$kappas2,col="red",lty=3)
legend("bottomright",c("svm","rpart"),col=c("blue","red"),pch=c(18,17),lty=c(1,3),cex=0.8)
plot(mydata$sensitivity1,pch=19,xlab="",ylab="",ylim=c(0,1),cex=0.8,col="blue",xaxt="n",main="Sensitivity")
axis(side=1,at=c(1:10),line=NA)
lines(mydata$sensitivity1,col="blue")
points(mydata$sensitivity2,col="red",pch=17,cex=0.8)
lines(mydata$sensitivity2,col="red",lty=3)
legend("bottomright",c("svm","rpart"),col=c("blue","red"),pch=c(18,17),lty=c(1,3),cex=0.8)
plot(mydata$specificity1,pch=19,xlab="",ylab="",ylim=c(0,1),cex=0.8,col="blue",xaxt="n",main="Specificity")
axis(side=1,at=c(1:10),line=NA)
lines(mydata$specificity1,col="blue")
points(mydata$specificity2,col="red",pch=17,cex=0.8)
lines(mydata$specificity2,col="red",lty=3)
legend("bottomright",c("svm","rpart"),col=c("blue","red"),pch=c(18,17),lty=c(1,3),cex=0.8)
par(mfrow=c(1,1))
ggplot2
基于核心理念:Data + Aes (美学映射) + Geom (几何对象)。
geom_point)、线
(geom_line)、柱 (geom_bar) 等。p1 <- ggplot(data = clean_data_2007,
mapping = aes(x = gdpPercap, y = lifeExp)) +
geom_point() +
labs(title = "ggplot2 Basic Plot")
p1
场景:比较不同大洲的预期寿命分布。
# Base R
boxplot(lifeExp ~ continent, data = clean_data_2007,
main = "Base R Boxplot", col = "lightblue")
# ggplot2
p_box <- ggplot(clean_data_2007, aes(x = continent, y = lifeExp, fill = continent)) +
geom_boxplot() +
labs(title = "ggplot2 Boxplot") +
theme_minimal() +
theme(legend.position = "none")
p_box
使用 penguins
数据集展示更丰富的映射(颜色、形状、大小)。
# 基础映射
ggplot(data = penguins, mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)) +
geom_point() +
labs(title = "Penguins: Color Mapping")
# 复杂映射 (颜色+形状+大小)
ggplot(data = penguins, aes(x = flipper_length_mm, y = body_mass_g, color = species, shape = species, size = body_mass_g)) +
geom_point(alpha = 0.6) +
labs(title = "Penguins: Complex Mapping")
# 添加平滑曲线
ggplot(data = penguins, mapping = aes(x = flipper_length_mm, y = body_mass_g)) +
geom_point(alpha = 0.6) +
geom_smooth() +
labs(title = "Penguins: With Smoothing Line")
分面是将数据按照某个变量拆分成多个子图,是比较组间差异的神器。
# 按物种分面
ggplot(data = penguins, mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)) +
geom_point() +
facet_wrap(~species) +
labs(title = "Facet by Species")
# 另一种分面示例
ggplot(data = penguins, mapping = aes(x = species, y = body_mass_g, color = species)) +
geom_boxplot() +
facet_wrap(~island) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title = "Facet by Island")
展示为何可视化很重要(统计指标相同,但图形完全不同)。
# 读取本地数据
mydata <- read.table("./dataset/DatasaurusDozen.tsv", header=T, stringsAsFactors=F)
head(mydata)
# 数据摘要对比
table(mydata$dataset)
##
## away bullseye circle dino dots h_lines high_lines
## 142 142 142 142 142 142 142
## slant_down slant_up star v_lines wide_lines x_shape
## 142 142 142 142 142 142
summary(subset(mydata, dataset=="away"))
## dataset x y
## Length:142 Min. :15.56 Min. : 0.01512
## Class :character 1st Qu.:39.72 1st Qu.:24.62589
## Mode :character Median :53.34 Median :47.53527
## Mean :54.27 Mean :47.83472
## 3rd Qu.:69.15 3rd Qu.:71.80315
## Max. :91.64 Max. :97.47577
summary(subset(mydata, dataset=="dino"))
## dataset x y
## Length:142 Min. :22.31 Min. : 2.949
## Class :character 1st Qu.:44.10 1st Qu.:25.288
## Mode :character Median :53.33 Median :46.026
## Mean :54.26 Mean :47.832
## 3rd Qu.:64.74 3rd Qu.:68.526
## Max. :98.21 Max. :99.487
# 绘图对比
ggplot(mydata, aes(dataset,y, color=dataset))+geom_boxplot()
ggplot(mydata, aes(dataset,y, color=dataset))+geom_violin()
# 散点图
ggplot(mydata, aes(x,y, color=dataset))+geom_point()
# 分面展示
p <- ggplot(mydata, aes(x,y, color=dataset)) +
geom_point() +
facet_wrap(~dataset, nrow=3) +
scale_color_manual(values=c(brewer.pal(12,"Paired"), "black")) +
theme(legend.position="none")
print(p)
场景:比较各洲的平均寿命。
# ggplot2: 使用 geom_col 并配合 reorder 进行排序
p_bar <- ggplot(continent_summary, aes(x = reorder(continent, -avg_lifeExp), y = avg_lifeExp)) +
geom_col(fill = "steelblue") +
labs(title = "ggplot2 Barplot", x = "Continent", y = "Avg Life Exp") +
theme_minimal()
p_bar
目标:使用 Scale, Theme, ggsci 提升图表质感。
为后续美化做准备,绘制一个包含 4 个维度的复杂散点图。
p2 <- ggplot(data = clean_data_2007,
mapping = aes(x = gdpPercap,
y = lifeExp,
color = continent,
size = pop_m)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "loess", se = FALSE, color = "grey50")
p2
对 GDP 进行对数变换 (scale_x_log10),并使用色盲友好的
viridis 配色。
p3 <- p2 +
scale_x_log10(labels = dollar_format()) +
scale_size_continuous(range = c(2, 12), guide = "none") +
scale_color_viridis(discrete = TRUE, option = "D") + # 默认值,从亮黄色到绿色、再到深蓝色/紫色。
labs(
title = "全球经济与健康状况 (2007)",
subtitle = "人均GDP与预期寿命呈显著正相关",
x = "人均 GDP (对数坐标 USD)",
y = "预期寿命 (岁)",
color = "大洲",
caption = "数据来源: Gapminder Project"
)
p3
定义一个统一的 Theme 对象,应用到所有图表中。
my_journal_theme <- theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 18, color = "#2c3e50"),
plot.subtitle = element_text(face = "italic", color = "#7f8c8d", margin = margin(b = 15)),
legend.position = "top",
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
axis.text = element_text(color = "black"),
axis.line = element_line(color = "black")
)
p4 <- p3 + my_journal_theme
p4
使用 ggsci 包一键调用顶级医学期刊的配色。
# JAMA 风格
p_jama <- p2 +
scale_x_log10(labels = dollar_format()) +
scale_size_continuous(range = c(2, 12), guide = "none") +
scale_color_jama() +
labs(title = "JAMA Style", subtitle = "ggsci::scale_color_jama") +
theme_minimal(base_size = 12) + theme(legend.position = "top")
# Lancet 风格
p_lancet <- p2 +
scale_x_log10(labels = dollar_format()) +
scale_size_continuous(range = c(2, 12), guide = "none") +
scale_color_lancet() +
labs(title = "Lancet Style", subtitle = "ggsci::scale_color_lancet") +
theme_minimal(base_size = 12) + theme(legend.position = "top")
# 并排展示
library(cowplot)
plot_grid(p_jama, p_lancet, labels = "AUTO")
展示如何将 grid.arrange 和 ggsci
结合使用。
data("diamonds")
# 子图1:散点图
sub_p1 <- ggplot(subset(diamonds, carat >= 2.2),
aes(x = table, y = price, colour = cut)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "loess", alpha = 0.05, linewidth = 1, span = 1) +
theme_bw() +
labs(title = "Price vs Table")
# 子图2:直方图
sub_p2 <- ggplot(subset(diamonds, carat > 2.2 & depth > 55 & depth < 70),
aes(x = depth, fill = cut)) +
geom_histogram(colour = "black", binwidth = 1, position = "dodge") +
theme_bw() +
labs(title = "Depth Distribution")
# 组合:应用 NPG (Nature Publishing Group) 配色
grid.arrange(
sub_p1 + scale_color_npg(),
sub_p2 + scale_fill_npg(),
ncol = 2,
top = "Nature Publishing Group Style (ggsci)"
)
library(ggplot2)
library(cowplot)
# 箱线图 (Boxplot)
bxp <- ggplot(ToothGrowth, aes(x = factor(dose), y = len, color = factor(dose))) +
geom_boxplot() +
scale_color_jco() +
labs(x = "Dose", y = "Length", color = "Dose") # 添加标签
bxp
dp <- ggplot(ToothGrowth, aes(x = factor(dose), y = len, fill = factor(dose))) +
# 注意:geom_dotplot 需要 stackgroups=TRUE 和 binaxis="y"
# binwidth = 1 对应原始函数中的参数
geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 1.1) +
scale_fill_jco() +
labs(x = "Dose", y = "Length", fill = "Dose")
dp
data("mtcars")
mtcars$name <- rownames(mtcars)
mtcars$cyl <- as.factor(mtcars$cyl)
# 1. 数据预处理:实现分组排序 (sort.by.groups = TRUE, sort.val = "asc")
# 找出每个 name (车型) 在其 cyl 组内的排序位置
mtcars_sorted <- mtcars %>%
group_by(cyl) %>%
arrange(mpg) %>%
mutate(name_rank = row_number()) %>%
ungroup() %>%
# 使用 forcats 包的 fct_reorder2 或手动创建排序因子
# 为了模仿 ggpubr 的分组排序效果,这里按 cyl 和 mpg 的升序创建因子水平
arrange(cyl, mpg) %>%
mutate(name = factor(name, levels = unique(name)))
# 2. 绘制棒状图
bp <- ggplot(mtcars_sorted, aes(x = name, y = mpg, fill = cyl)) +
# 绘制柱状图,stat="identity" 表示 y 值直接取数据中的 mpg
# color="white" 用于设置柱子边框颜色
geom_bar(stat = "identity", color = "white") +
# 配色方案
scale_fill_jco() +
# 调整主题:旋转 x 轴文本
theme(
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
legend.position = "right"
) +
labs(x = "Car Model", y = "Miles Per Gallon (MPG)", fill = "Cylinders")
bp
library("cowplot")
plot_grid(bxp, dp, bp + theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()),
labels = c("A", "B", "C"),
ncol = 2, nrow = 2)
目标:突出重点,制作交互式网页图表。
在所有灰色的背景点中,高亮中国、美国等重点国家,引导读者视线。
labels_data <- clean_data_2007 %>%
filter(country %in% c("China", "United States", "India", "Nigeria"))
p_story <- ggplot(clean_data_2007, aes(x = gdpPercap, y = lifeExp)) +
# 背景层:灰色
geom_point(aes(size = pop_m), color = "grey85") +
# 高亮层:彩色
geom_point(data = labels_data, aes(size = pop_m, color = continent), alpha = 0.9) +
# 标签层:避免重叠
geom_text_repel(data = labels_data, aes(label = country), box.padding = 0.5) +
scale_x_log10(labels = dollar_format()) +
scale_size(range = c(2, 12), guide = "none") +
scale_color_brewer(palette = "Set1") +
my_journal_theme +
labs(title = "发展中大国的崛起", subtitle = "重点关注:中国 vs 美国")
p_story
将静态图表转换为 HTML 交互组件。试着将鼠标悬停在点上。
library(plotly)
interactive_plot <- ggplotly(p4, tooltip = c("x", "y", "color", "size"))
interactive_plot
重现 Hans Rosling 的经典气泡图动画。点击图表下方的 Play 按钮。
p_motion <- ggplot(gapminder, aes(x = gdpPercap, y = lifeExp,
size = pop, color = continent,
frame = year, # 核心:动画帧
text = country)) +
geom_point(alpha = 0.7) +
scale_x_log10(labels = dollar_format()) +
scale_size(range = c(2, 20), guide = "none") +
scale_color_viridis(discrete = TRUE, option = "D") +
theme_minimal() +
labs(title = "全球发展动态轨迹 (1952-2007)", x = "人均 GDP", y = "预期寿命")
ggplotly(p_motion, tooltip = "text") %>%
animation_opts(frame = 1000, easing = "linear", redraw = FALSE) %>%
animation_button(x = 1, xanchor = "right", y = 0, yanchor = "bottom")