模块零:R语言快速预热

目标:熟悉 RStudio 环境,学会使用帮助文档和基础命令。

0.1 把 R 当作计算器

587 * (23234 + 899) / 9
## [1] 1574008

0.2 善用 Example 快速学习

使用 example() 函数可以直接运行包作者提供的示例代码,这是学习新函数最快的方式。

点击查看 example(boxplot) 运行结果
  example(boxplot)
  ## 
  ## boxplt> ## boxplot on a formula:
  ## boxplt> boxplot(count ~ spray, data = InsectSprays, col = "lightgray")
  ## 
  ## boxplt> # *add* notches (somewhat funny here <--> warning "notches .. outside hinges"):
  ## boxplt> boxplot(count ~ spray, data = InsectSprays,
  ## boxplt+         notch = TRUE, add = TRUE, col = "blue")

  ## 
  ## boxplt> boxplot(decrease ~ treatment, data = OrchardSprays, col = "bisque",
  ## boxplt+         log = "y")

  ## 
  ## boxplt> ## horizontal=TRUE, switching  y <--> x :
  ## boxplt> boxplot(decrease ~ treatment, data = OrchardSprays, col = "bisque",
  ## boxplt+         log = "x", horizontal=TRUE)

  ## 
  ## boxplt> rb <- boxplot(decrease ~ treatment, data = OrchardSprays, col = "bisque")

  ## 
  ## boxplt> title("Comparing boxplot()s and non-robust mean +/- SD")
  ## 
  ## boxplt> mn.t <- tapply(OrchardSprays$decrease, OrchardSprays$treatment, mean)
  ## 
  ## boxplt> sd.t <- tapply(OrchardSprays$decrease, OrchardSprays$treatment, sd)
  ## 
  ## boxplt> xi <- 0.3 + seq(rb$n)
  ## 
  ## boxplt> points(xi, mn.t, col = "orange", pch = 18)
  ## 
  ## boxplt> arrows(xi, mn.t - sd.t, xi, mn.t + sd.t,
  ## boxplt+        code = 3, col = "pink", angle = 75, length = .1)
  ## 
  ## boxplt> ## boxplot on a matrix:
  ## boxplt> mat <- cbind(Uni05 = (1:100)/21, Norm = rnorm(100),
  ## boxplt+              `5T` = rt(100, df = 5), Gam2 = rgamma(100, shape = 2))
  ## 
  ## boxplt> boxplot(mat) # directly, calling boxplot.matrix()

  ## 
  ## boxplt> ## boxplot on a data frame:
  ## boxplt> df. <- as.data.frame(mat)
  ## 
  ## boxplt> par(las = 1) # all axis labels horizontal
  ## 
  ## boxplt> boxplot(df., main = "boxplot(*, horizontal = TRUE)", horizontal = TRUE)

  ## 
  ## boxplt> ## Using 'at = ' and adding boxplots -- example idea by Roger Bivand :
  ## boxplt> boxplot(len ~ dose, data = ToothGrowth,
  ## boxplt+         boxwex = 0.25, at = 1:3 - 0.2,
  ## boxplt+         subset = supp == "VC", col = "yellow",
  ## boxplt+         main = "Guinea Pigs' Tooth Growth",
  ## boxplt+         xlab = "Vitamin C dose mg",
  ## boxplt+         ylab = "tooth length",
  ## boxplt+         xlim = c(0.5, 3.5), ylim = c(0, 35), yaxs = "i")

  ## 
  ## boxplt> boxplot(len ~ dose, data = ToothGrowth, add = TRUE,
  ## boxplt+         boxwex = 0.25, at = 1:3 + 0.2,
  ## boxplt+         subset = supp == "OJ", col = "orange")
  ## 
  ## boxplt> legend(2, 9, c("Ascorbic acid", "Orange juice"),
  ## boxplt+        fill = c("yellow", "orange"))
  ## 
  ## boxplt> ## With less effort (slightly different) using factor *interaction*:
  ## boxplt> boxplot(len ~ dose:supp, data = ToothGrowth,
  ## boxplt+         boxwex = 0.5, col = c("orange", "yellow"),
  ## boxplt+         main = "Guinea Pigs' Tooth Growth",
  ## boxplt+         xlab = "Vitamin C dose mg", ylab = "tooth length",
  ## boxplt+         sep = ":", lex.order = TRUE, ylim = c(0, 35), yaxs = "i")

  ## 
  ## boxplt> ## more examples in  help(bxp)
  ## boxplt> 
  ## boxplt> 
  ## boxplt>
点击查看 example(heatmap) 运行结果
  example(heatmap)
  ## 
  ## heatmp> require(graphics); require(grDevices)
  ## 
  ## heatmp> x  <- as.matrix(mtcars)
  ## 
  ## heatmp> rc <- rainbow(nrow(x), start = 0, end = .3)
  ## 
  ## heatmp> cc <- rainbow(ncol(x), start = 0, end = .3)
  ## 
  ## heatmp> hv <- heatmap(x, col = cm.colors(256), scale = "column",
  ## heatmp+               RowSideColors = rc, ColSideColors = cc, margins = c(5,10),
  ## heatmp+               xlab = "specification variables", ylab =  "Car Models",
  ## heatmp+               main = "heatmap(<Mtcars data>, ..., scale = \"column\")")

  ## 
  ## heatmp> utils::str(hv) # the two re-ordering index vectors
  ## List of 4
  ##  $ rowInd: int [1:32] 31 17 16 15 5 25 29 24 7 6 ...
  ##  $ colInd: int [1:11] 2 9 8 11 6 5 10 7 1 4 ...
  ##  $ Rowv  : NULL
  ##  $ Colv  : NULL
  ## 
  ## heatmp> ## no column dendrogram (nor reordering) at all:
  ## heatmp> heatmap(x, Colv = NA, col = cm.colors(256), scale = "column",
  ## heatmp+         RowSideColors = rc, margins = c(5,10),
  ## heatmp+         xlab = "specification variables", ylab =  "Car Models",
  ## heatmp+         main = "heatmap(<Mtcars data>, ..., scale = \"column\")")

  ## 
  ## heatmp> ## Don't show: 
  ## heatmp> ## no row dendrogram (nor reordering) at all:
  ## heatmp> heatmap(x, Rowv = NA, col = cm.colors(256), scale = "column",
  ## heatmp+         ColSideColors = cc, margins = c(5,10),
  ## heatmp+         xlab = "xlab", ylab =  "ylab")  # no main

  ## 
  ## heatmp> ## End(Don't show)
  ## heatmp> ## "no nothing"
  ## heatmp> heatmap(x, Rowv = NA, Colv = NA, scale = "column",
  ## heatmp+         main = "heatmap(*, NA, NA) ~= image(t(x))")

  ## 
  ## heatmp> ## Demonstration of the 'scale' argument:
  ## heatmp> ## The only change in the code is the 'scale' arg.
  ## heatmp> ## The only visible change is in the color scale on the heatmap
  ## heatmp> ## (the original data are not scaled).
  ## heatmp> 
  ## heatmp> heatmap(x, col = terrain.colors(128), scale = "column",
  ## heatmp+         RowSideColors = rc,
  ## heatmp+         ColSideColors = cc,
  ## heatmp+         margins = c(5,10),
  ## heatmp+         main = "heatmap(<Mtcars data>, ..., scale = \"column\")")

  ## 
  ## heatmp> heatmap(x, col = terrain.colors(128), scale = "none",
  ## heatmp+         RowSideColors = rc,
  ## heatmp+         ColSideColors = cc,
  ## heatmp+         margins = c(5,10),
  ## heatmp+         main = "heatmap(<Mtcars data>, ..., scale = \"none\")")

  ## 
  ## heatmp> round(Ca <- cor(attitude), 2)
  ##            rating complaints privileges learning raises critical advance
  ## rating       1.00       0.83       0.43     0.62   0.59     0.16    0.16
  ## complaints   0.83       1.00       0.56     0.60   0.67     0.19    0.22
  ## privileges   0.43       0.56       1.00     0.49   0.45     0.15    0.34
  ## learning     0.62       0.60       0.49     1.00   0.64     0.12    0.53
  ## raises       0.59       0.67       0.45     0.64   1.00     0.38    0.57
  ## critical     0.16       0.19       0.15     0.12   0.38     1.00    0.28
  ## advance      0.16       0.22       0.34     0.53   0.57     0.28    1.00
  ## 
  ## heatmp> symnum(Ca) # simple graphic
  ##            rt cm p l rs cr a
  ## rating     1                
  ## complaints +  1             
  ## privileges .  .  1          
  ## learning   ,  .  . 1        
  ## raises     .  ,  . , 1      
  ## critical             .  1   
  ## advance          . . .     1
  ## attr(,"legend")
  ## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1
  ## 
  ## heatmp> heatmap(Ca,               symm = TRUE, margins = c(6,6)) # with reorder()

  ## 
  ## heatmp> heatmap(Ca, Rowv = FALSE, symm = TRUE, margins = c(6,6)) # _NO_ reorder()

  ## 
  ## heatmp> ## slightly artificial with color bar, without and with ordering:
  ## heatmp> cc <- rainbow(nrow(Ca))
  ## 
  ## heatmp> heatmap(Ca, Rowv = FALSE, symm = TRUE, RowSideColors = cc, ColSideColors = cc,
  ## heatmp+    margins = c(6,6))

  ## 
  ## heatmp> heatmap(Ca,        symm = TRUE, RowSideColors = cc, ColSideColors = cc,
  ## heatmp+    margins = c(6,6))

  ## 
  ## heatmp> ## For variable clustering, rather use distance based on cor():
  ## heatmp> symnum( cU <- cor(USJudgeRatings) )
  ##      CO I DM DI CF DE PR F O W PH R
  ## CONT 1                             
  ## INTG    1                          
  ## DMNR    B 1                        
  ## DILG    + +  1                     
  ## CFMG    + +  B  1                  
  ## DECI    + +  B  B  1               
  ## PREP    + +  B  B  B  1            
  ## FAMI    + +  B  *  *  B  1         
  ## ORAL    * *  B  B  *  B  B 1       
  ## WRIT    * +  B  *  *  B  B B 1     
  ## PHYS    , ,  +  +  +  +  + + + 1   
  ## RTEN    * *  *  *  *  B  * B B *  1
  ## attr(,"legend")
  ## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1
  ## 
  ## heatmp> hU <- heatmap(cU, Rowv = FALSE, symm = TRUE, col = topo.colors(16),
  ## heatmp+              distfun = function(c) as.dist(1 - c), keep.dendro = TRUE)

  ## 
  ## heatmp> ## The Correlation matrix with same reordering:
  ## heatmp> round(100 * cU[hU[[1]], hU[[2]]])
  ##      CONT INTG DMNR PHYS DILG CFMG DECI RTEN ORAL WRIT PREP FAMI
  ## CONT  100  -13  -15    5    1   14    9   -3   -1   -4    1   -3
  ## INTG  -13  100   96   74   87   81   80   94   91   91   88   87
  ## DMNR  -15   96  100   79   84   81   80   94   91   89   86   84
  ## PHYS    5   74   79  100   81   88   87   91   89   86   85   84
  ## DILG    1   87   84   81  100   96   96   93   95   96   98   96
  ## CFMG   14   81   81   88   96  100   98   93   95   94   96   94
  ## DECI    9   80   80   87   96   98  100   92   95   95   96   94
  ## RTEN   -3   94   94   91   93   93   92  100   98   97   95   94
  ## ORAL   -1   91   91   89   95   95   95   98  100   99   98   98
  ## WRIT   -4   91   89   86   96   94   95   97   99  100   99   99
  ## PREP    1   88   86   85   98   96   96   95   98   99  100   99
  ## FAMI   -3   87   84   84   96   94   94   94   98   99   99  100
  ## 
  ## heatmp> ## The column dendrogram:
  ## heatmp> utils::str(hU$Colv)
  ## --[dendrogram w/ 2 branches and 12 members at h = 1.15]
  ##   |--leaf "CONT" 
  ##   `--[dendrogram w/ 2 branches and 11 members at h = 0.258]
  ##      |--[dendrogram w/ 2 branches and 2 members at h = 0.0354]
  ##      |  |--leaf "INTG" 
  ##      |  `--leaf "DMNR" 
  ##      `--[dendrogram w/ 2 branches and 9 members at h = 0.187]
  ##         |--leaf "PHYS" 
  ##         `--[dendrogram w/ 2 branches and 8 members at h = 0.075]
  ##            |--[dendrogram w/ 2 branches and 3 members at h = 0.0438]
  ##            |  |--leaf "DILG" 
  ##            |  `--[dendrogram w/ 2 branches and 2 members at h = 0.0189]
  ##            |     |--leaf "CFMG" 
  ##            |     `--leaf "DECI" 
  ##            `--[dendrogram w/ 2 branches and 5 members at h = 0.0584]
  ##               |--leaf "RTEN" 
  ##               `--[dendrogram w/ 2 branches and 4 members at h = 0.0187]
  ##                  |--[dendrogram w/ 2 branches and 2 members at h = 0.00657]
  ##                  |  |--leaf "ORAL" 
  ##                  |  `--leaf "WRIT" 
  ##                  `--[dendrogram w/ 2 branches and 2 members at h = 0.0101]
  ##                     |--leaf "PREP" 
  ##                     `--leaf "FAMI"

0.3 查阅帮助文档

学会查文档是编程的核心能力。

# 使用 ?函数名 查看文档
# ?t.test 

# 运行一个 t 检验示例
t.test(rnorm(n=10, mean=10, sd=1), rnorm(n=10, mean=0, sd=1))
## 
##  Welch Two Sample t-test
## 
## data:  rnorm(n = 10, mean = 10, sd = 1) and rnorm(n = 10, mean = 0, sd = 1)
## t = 27.62, df = 17.069, p-value = 1.312e-15
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   8.944441 10.423450
## sample estimates:
##  mean of x  mean of y 
## 9.74743092 0.06348571
# help.start() # 运行此命令可在浏览器中打开帮助主页

模块一:环境准备与数据思维

目标:加载包,理解 Tidy Data 格式,进行基础的数据清洗。

1.1 加载必要的库

packages_to_install <- c(
  # 1. 核心绘图与数据处理
  "tidyverse",      # 包含 ggplot2, dplyr, tidyr, readr 等
  "gapminder",      # 教学数据集
  "palmerpenguins", # 教学数据集
  
  # 2. 辅助工具
  "scales",     # 调整坐标轴标签 (如 $1M)
  "ggrepel",    # 解决文字重叠
  "cowplot",    # 多图拼贴 (SCI发表级排版)
  "gridExtra",  # 另一种多图排列工具
  
  # 3. 配色方案
  "viridis",    # 色盲友好配色
  "ggsci",      # 顶刊配色 (JAMA, Lancet等)
  "RColorBrewer", # 经典调色板
  
  # 4. 交互式绘图
  "plotly"      # 动态交互
)

# 检查哪些包尚未安装
new_packages <- packages_to_install[!(packages_to_install %in% installed.packages()[,"Package"])]

# 如果有未安装的包,则进行安装
if(length(new_packages) > 0) {
  # dependencies = TRUE 是默认值,但显式指定更安全。
  # 它会自动安装这些包所需的所有其他依赖包。
  install.packages(new_packages, dependencies = TRUE)
  cat("\n--- 所有缺失的包已安装完成 ---\n")
} else {
  cat("\n--- 所有包均已安装,无需操作 ---\n")
}
## 
## --- 所有包均已安装,无需操作 ---
# 1. 核心数据处理与绘图
library(tidyverse)      # ggplot2, dplyr, tidyr
library(gapminder)      # 教学数据集: 全球发展数据
library(palmerpenguins) # 教学数据集: 企鹅数据

# 2. 辅助工具
library(scales)     # 调整坐标轴标签 (如 $1M)
library(ggrepel)    # 解决文字重叠
library(cowplot)    # 多图拼贴 (SCI发表级排版)
library(gridExtra)  # 另一种多图排列工具

# 3. 配色方案
library(viridis)    # 色盲友好配色
library(ggsci)      # 顶刊配色 (JAMA, Lancet等)
library(RColorBrewer) # 经典调色板

# 4. 交互式绘图
library(plotly)     # 动态交互

1.2 数据初探 (EDA)

使用 gapminder 数据集,它记录了全球各国半个多世纪的发展轨迹。

data("gapminder")

# 查看数据结构
glimpse(gapminder)
## Rows: 1,704
## Columns: 6
## $ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
# 数据摘要
summary(gapminder)
##         country        continent        year         lifeExp     
##  Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
##  Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
##  Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
##  Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
##  Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
##  Australia  :  12                  Max.   :2007   Max.   :82.60  
##  (Other)    :1632                                                
##       pop              gdpPercap       
##  Min.   :6.001e+04   Min.   :   241.2  
##  1st Qu.:2.794e+06   1st Qu.:  1202.1  
##  Median :7.024e+06   Median :  3531.8  
##  Mean   :2.960e+07   Mean   :  7215.3  
##  3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
##  Max.   :1.319e+09   Max.   :113523.1  
## 

1.3 数据清洗

为了绘图方便,我们筛选 2007 年的数据,并计算总 GDP。

clean_data_2007 <- gapminder %>%
  filter(year == 2007) %>%
  mutate(
    pop_m = pop / 1e6,           # 人口转换为百万
    gdp_total = gdpPercap * pop, # 计算总GDP
    is_china = ifelse(country == "China", "China", "Other") # 标记中国
  )

head(clean_data_2007)

1.4 分组统计 (Group By)

使用 group_bysummarize 计算各大洲的汇总数据。

continent_summary <- gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarize(
    avg_lifeExp = mean(lifeExp),     # 平均预期寿命
    total_pop = sum(pop),            # 总人口
    median_gdp = median(gdpPercap),  # 中位数GDP
    n_countries = n()                # 国家数量
  ) %>%
  arrange(desc(avg_lifeExp))

knitr::kable(continent_summary, caption = "2007年各大洲汇总数据")
2007年各大洲汇总数据
continent avg_lifeExp total_pop median_gdp n_countries
Oceania 80.71950 24549947 29810.188 2
Europe 77.64860 586098529 28054.066 30
Americas 73.60812 898871184 8948.103 25
Asia 70.72848 3811953827 4471.062 33
Africa 54.80604 929539692 1452.267 52

模块二:图形语法基础

目标:对比 Base R 与 ggplot2,理解 映射 (Mapping)几何对象 (Geoms)

2.1 Base R vs ggplot2

场景:查看人均 GDP 与 预期寿命 的关系。

2.1.1 Base R (基础绘图 散点图)

Base R 就像在一张纸上画图,需要手动控制每一个细节(如颜色映射、图例)。

# 使用 pch 参数将大洲映射为不同的点形状
plot(clean_data_2007$gdpPercap, clean_data_2007$lifeExp,
     col = clean_data_2007$continent,
     pch = as.numeric(clean_data_2007$continent), 
     main = "Base R Plot", xlab = "GDP", ylab = "Life Expectancy")

# 手动添加图例
legend("bottomright", 
       legend = levels(clean_data_2007$continent), 
       col = 1:length(levels(clean_data_2007$continent)), 
       pch = 1:length(levels(clean_data_2007$continent)),
       cex = 0.7)

2.1.2 Base R 进阶示例1 (Planning.csv)

高级绘图命令产生一个图,低级绘图命令增加点,线,文本,图例等。

# 读取本地数据
mydata <- read.table("./dataset/Planning.csv", header = T) 
head(mydata)
summary(mydata)
##        id              age            relig            ped       
##  Min.   :  1.00   Min.   :18.00   Min.   :1.000   Min.   :0.000  
##  1st Qu.: 63.25   1st Qu.:24.00   1st Qu.:1.000   1st Qu.:2.000  
##  Median :125.50   Median :27.00   Median :1.000   Median :2.000  
##  Mean   :125.64   Mean   :27.43   Mean   :1.108   Mean   :3.283  
##  3rd Qu.:187.75   3rd Qu.:30.00   3rd Qu.:1.000   3rd Qu.:5.000  
##  Max.   :251.00   Max.   :41.00   Max.   :2.000   Max.   :7.000  
##                                   NA's   :1       NA's   :24     
##      income            am            reason           bps           bpd        
##  Min.   :1.000   Min.   :15.00   Min.   :1.000   Min.   :  0   Min.   :  0.00  
##  1st Qu.:1.000   1st Qu.:18.00   1st Qu.:1.000   1st Qu.:110   1st Qu.: 60.00  
##  Median :2.000   Median :20.00   Median :1.000   Median :110   Median : 70.00  
##  Mean   :2.129   Mean   :20.35   Mean   :1.484   Mean   :113   Mean   : 71.69  
##  3rd Qu.:3.000   3rd Qu.:22.00   3rd Qu.:2.000   3rd Qu.:120   3rd Qu.: 80.00  
##  Max.   :6.000   Max.   :31.00   Max.   :3.000   Max.   :170   Max.   :110.00  
##  NA's   :26      NA's   :1       NA's   :2       NA's   :7     NA's   :7       
##        wt              ht       
##  Min.   : 0.00   Min.   :141.0  
##  1st Qu.:46.50   1st Qu.:150.0  
##  Median :51.40   Median :153.0  
##  Mean   :51.89   Mean   :153.5  
##  3rd Qu.:57.00   3rd Qu.:156.0  
##  Max.   :73.80   Max.   :175.0  
##  NA's   :5       NA's   :7
# 绘制箱线图
boxplot(mydata$bps ~ mydata$income, col = "tomato", main = "Income vs BPS")

# 删除缺失值
mydata.omit <- na.omit(mydata)
dim(mydata.omit)
## [1] 210  11
# 绘制散点图
plot(
  mydata.omit$ht,
  mydata.omit$wt,
  pch = 20,
  col = "rosybrown",
  xlab = "Height",
  ylab = "Weight",
  main = "Base R: Plot + Abline + Text"
)

# 增加线性回归线
lm_line <- lm(mydata.omit$wt ~ mydata.omit$ht)
abline(lm_line)

# 增加文本 
text(158, 65, "Weight=0.53*Height-29.36")

# 增加图例
legend("topright", "A legend", col = "rosybrown", pch = 20)

2.1.3 Base R 进阶示例2 (For in one)

mydata <- read.table("./dataset/svmvsrpart10times.csv",header=T)
print(mydata)
##       diags1    diags2   kappas1   kappas2 sensitivity1 sensitivity2
## 1  0.8148148 0.7870370 0.5749705 0.5083135    0.6153846    0.5641026
## 2  0.8148148 0.8240741 0.5847751 0.6032483    0.6666667    0.6666667
## 3  0.7592593 0.7314815 0.4840132 0.4212860    0.6923077    0.6410256
## 4  0.7962963 0.6574074 0.5379230 0.2776573    0.6153846    0.5897436
## 5  0.6944444 0.6666667 0.3109049 0.3157339    0.4871795    0.6666667
## 6  0.7500000 0.7685185 0.4361949 0.4955157    0.5641026    0.6666667
## 7  0.7870370 0.6851852 0.5306122 0.3100338    0.6666667    0.5384615
## 8  0.7500000 0.7314815 0.4551570 0.4212860    0.6410256    0.6410256
## 9  0.7962963 0.7500000 0.5379230 0.4295775    0.6153846    0.5384615
## 10 0.7685185 0.7500000 0.4779582 0.4551570    0.5897436    0.6410256
##    specificity1 specificity2
## 1     0.9275362    0.9130435
## 2     0.8985507    0.9130435
## 3     0.7971014    0.7826087
## 4     0.8985507    0.6956522
## 5     0.8115942    0.6666667
## 6     0.8550725    0.8260870
## 7     0.8550725    0.7681159
## 8     0.8115942    0.7826087
## 9     0.8985507    0.8695652
## 10    0.8695652    0.8115942
plot(mydata$diags1,pch=19,xlab="",ylab="",ylim=c(0,1),cex=0.8,col="blue",xaxt="n",main="Accuracy")
axis(side=1,at=c(1:10),line=NA)
lines(mydata$diags1,col="blue")
points(mydata$diags2,col="red",pch=17,cex=0.8)
lines(mydata$diags2,col="red",lty=3)
legend("bottomright",c("svm","rpart"),col=c("blue","red"),pch=c(18,17),lty=c(1,3),cex=0.8)

par(mfrow=c(2,2))

plot(mydata$diags1,pch=19,xlab="",ylab="",ylim=c(0,1),cex=0.8,col="blue",xaxt="n",main="Accuracy")
axis(side=1,at=c(1:10),line=NA)
lines(mydata$diags1,col="blue")
points(mydata$diags2,col="red",pch=17,cex=0.8)
lines(mydata$diags2,col="red",lty=3)
legend("bottomright",c("svm","rpart"),col=c("blue","red"),pch=c(18,17),lty=c(1,3),cex=0.8)

plot(mydata$kappas1,pch=19,xlab="",ylab="",ylim=c(0,1),cex=0.8,col="blue",xaxt="n",main="Kappa")
axis(side=1,at=c(1:10),line=NA)
lines(mydata$kappas1,col="blue")
points(mydata$kappas2,col="red",pch=17,cex=0.8)
lines(mydata$kappas2,col="red",lty=3)
legend("bottomright",c("svm","rpart"),col=c("blue","red"),pch=c(18,17),lty=c(1,3),cex=0.8)

plot(mydata$sensitivity1,pch=19,xlab="",ylab="",ylim=c(0,1),cex=0.8,col="blue",xaxt="n",main="Sensitivity")
axis(side=1,at=c(1:10),line=NA)
lines(mydata$sensitivity1,col="blue")
points(mydata$sensitivity2,col="red",pch=17,cex=0.8)
lines(mydata$sensitivity2,col="red",lty=3)
legend("bottomright",c("svm","rpart"),col=c("blue","red"),pch=c(18,17),lty=c(1,3),cex=0.8)

plot(mydata$specificity1,pch=19,xlab="",ylab="",ylim=c(0,1),cex=0.8,col="blue",xaxt="n",main="Specificity")
axis(side=1,at=c(1:10),line=NA)
lines(mydata$specificity1,col="blue")
points(mydata$specificity2,col="red",pch=17,cex=0.8)
lines(mydata$specificity2,col="red",lty=3)
legend("bottomright",c("svm","rpart"),col=c("blue","red"),pch=c(18,17),lty=c(1,3),cex=0.8)

par(mfrow=c(1,1))

2.1.4 ggplot2 (图层语法)

ggplot2 基于核心理念:Data + Aes (美学映射) + Geom (几何对象)

  • 美学 (Aesthetics): 图表中对象的视觉属性,例如位置(x 和 y 轴)、颜色、形状、大小、透明度等。
  • 几何对象 (Geoms): 用于在图表中实际呈现数据的几何形状,例如点 (geom_point)、线 (geom_line)、柱 (geom_bar) 等。
  • 分面 (Facets): 一种将数据划分为多个子集并在不同的小图中显示的方法,用于比较不同组的数据。
  • 标签和注释 (Labels and annotations): 用于自定义图表的标题、副标题、轴标签、图例标题以及添加额外的文本或图形注释。
p1 <- ggplot(data = clean_data_2007, 
             mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  labs(title = "ggplot2 Basic Plot")

p1

2.2 盒型图 (Boxplot)

场景:比较不同大洲的预期寿命分布。

# Base R
boxplot(lifeExp ~ continent, data = clean_data_2007,
        main = "Base R Boxplot", col = "lightblue")

# ggplot2
p_box <- ggplot(clean_data_2007, aes(x = continent, y = lifeExp, fill = continent)) +
  geom_boxplot() +
  labs(title = "ggplot2 Boxplot") +
  theme_minimal() +
  theme(legend.position = "none")

p_box

2.3 散点图与 Palmer Penguins 示例

使用 penguins 数据集展示更丰富的映射(颜色、形状、大小)。

# 基础映射
ggplot(data = penguins, mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)) + 
  geom_point() +
  labs(title = "Penguins: Color Mapping")

# 复杂映射 (颜色+形状+大小)
ggplot(data = penguins, aes(x = flipper_length_mm, y = body_mass_g, color = species, shape = species, size = body_mass_g)) + 
  geom_point(alpha = 0.6) +
  labs(title = "Penguins: Complex Mapping")

# 添加平滑曲线
ggplot(data = penguins, mapping = aes(x = flipper_length_mm, y = body_mass_g)) + 
  geom_point(alpha = 0.6) + 
  geom_smooth() +
  labs(title = "Penguins: With Smoothing Line")

2.4 核心技巧:分面 (Facets)

分面是将数据按照某个变量拆分成多个子图,是比较组间差异的神器。

# 按物种分面
ggplot(data = penguins, mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)) + 
  geom_point() + 
  facet_wrap(~species) +
  labs(title = "Facet by Species")

# 另一种分面示例
ggplot(data = penguins, mapping = aes(x = species, y = body_mass_g, color = species)) + 
  geom_boxplot() + 
  facet_wrap(~island) + 
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Facet by Island")

2.5 恐龙数据集演示 (Datasaurus)

展示为何可视化很重要(统计指标相同,但图形完全不同)。

# 读取本地数据
mydata <- read.table("./dataset/DatasaurusDozen.tsv", header=T, stringsAsFactors=F)
head(mydata)
# 数据摘要对比
table(mydata$dataset)
## 
##       away   bullseye     circle       dino       dots    h_lines high_lines 
##        142        142        142        142        142        142        142 
## slant_down   slant_up       star    v_lines wide_lines    x_shape 
##        142        142        142        142        142        142
summary(subset(mydata, dataset=="away"))
##    dataset                x               y           
##  Length:142         Min.   :15.56   Min.   : 0.01512  
##  Class :character   1st Qu.:39.72   1st Qu.:24.62589  
##  Mode  :character   Median :53.34   Median :47.53527  
##                     Mean   :54.27   Mean   :47.83472  
##                     3rd Qu.:69.15   3rd Qu.:71.80315  
##                     Max.   :91.64   Max.   :97.47577
summary(subset(mydata, dataset=="dino"))
##    dataset                x               y         
##  Length:142         Min.   :22.31   Min.   : 2.949  
##  Class :character   1st Qu.:44.10   1st Qu.:25.288  
##  Mode  :character   Median :53.33   Median :46.026  
##                     Mean   :54.26   Mean   :47.832  
##                     3rd Qu.:64.74   3rd Qu.:68.526  
##                     Max.   :98.21   Max.   :99.487
# 绘图对比
ggplot(mydata, aes(dataset,y, color=dataset))+geom_boxplot()

ggplot(mydata, aes(dataset,y, color=dataset))+geom_violin()

# 散点图
ggplot(mydata, aes(x,y, color=dataset))+geom_point()

# 分面展示
p <- ggplot(mydata, aes(x,y, color=dataset)) +
  geom_point() +
  facet_wrap(~dataset, nrow=3) + 
  scale_color_manual(values=c(brewer.pal(12,"Paired"), "black")) +
  theme(legend.position="none")

print(p)

2.6 条形图 (Barplot)

场景:比较各洲的平均寿命。

# ggplot2: 使用 geom_col 并配合 reorder 进行排序
p_bar <- ggplot(continent_summary, aes(x = reorder(continent, -avg_lifeExp), y = avg_lifeExp)) +
  geom_col(fill = "steelblue") +
  labs(title = "ggplot2 Barplot", x = "Continent", y = "Avg Life Exp") +
  theme_minimal()

p_bar


模块三:审美与出版级绘图

目标:使用 Scale, Theme, ggsci 提升图表质感。

3.1 预备复杂图表

为后续美化做准备,绘制一个包含 4 个维度的复杂散点图。

p2 <- ggplot(data = clean_data_2007, 
             mapping = aes(x = gdpPercap, 
                           y = lifeExp, 
                           color = continent, 
                           size = pop_m)) +
  geom_point(alpha = 0.7) + 
  geom_smooth(method = "loess", se = FALSE, color = "grey50")

p2

3.2 坐标轴与 Viridis 配色

对 GDP 进行对数变换 (scale_x_log10),并使用色盲友好的 viridis 配色。

p3 <- p2 +
  scale_x_log10(labels = dollar_format()) + 
  scale_size_continuous(range = c(2, 12), guide = "none") + 
  scale_color_viridis(discrete = TRUE, option = "D") + # 默认值,从亮黄色到绿色、再到深蓝色/紫色。
  labs(
    title = "全球经济与健康状况 (2007)",
    subtitle = "人均GDP与预期寿命呈显著正相关",
    x = "人均 GDP (对数坐标 USD)",
    y = "预期寿命 (岁)",
    color = "大洲",
    caption = "数据来源: Gapminder Project"
  )

p3

3.3 自定义期刊主题

定义一个统一的 Theme 对象,应用到所有图表中。

my_journal_theme <- theme_minimal(base_size = 14) + 
  theme(
    plot.title = element_text(face = "bold", size = 18, color = "#2c3e50"),
    plot.subtitle = element_text(face = "italic", color = "#7f8c8d", margin = margin(b = 15)),
    legend.position = "top",
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    axis.text = element_text(color = "black"),
    axis.line = element_line(color = "black")
  )

p4 <- p3 + my_journal_theme
p4

3.4 顶刊配色方案 (ggsci)

使用 ggsci 包一键调用顶级医学期刊的配色。

# JAMA 风格
p_jama <- p2 +
  scale_x_log10(labels = dollar_format()) +
  scale_size_continuous(range = c(2, 12), guide = "none") +
  scale_color_jama() +              
  labs(title = "JAMA Style", subtitle = "ggsci::scale_color_jama") +
  theme_minimal(base_size = 12) + theme(legend.position = "top")

# Lancet 风格
p_lancet <- p2 +
  scale_x_log10(labels = dollar_format()) +
  scale_size_continuous(range = c(2, 12), guide = "none") +
  scale_color_lancet() +            
  labs(title = "Lancet Style", subtitle = "ggsci::scale_color_lancet") +
  theme_minimal(base_size = 12) + theme(legend.position = "top")

# 并排展示
library(cowplot)
plot_grid(p_jama, p_lancet, labels = "AUTO")

3.5 综合案例:Diamonds 数据集与 ggsci

展示如何将 grid.arrangeggsci 结合使用。

data("diamonds")

# 子图1:散点图
sub_p1 <- ggplot(subset(diamonds, carat >= 2.2),
                 aes(x = table, y = price, colour = cut)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "loess", alpha = 0.05, linewidth = 1, span = 1) +
  theme_bw() + 
  labs(title = "Price vs Table")

# 子图2:直方图
sub_p2 <- ggplot(subset(diamonds, carat > 2.2 & depth > 55 & depth < 70),
                 aes(x = depth, fill = cut)) +
  geom_histogram(colour = "black", binwidth = 1, position = "dodge") +
  theme_bw() + 
  labs(title = "Depth Distribution")

# 组合:应用 NPG (Nature Publishing Group) 配色
grid.arrange(
  sub_p1 + scale_color_npg(), 
  sub_p2 + scale_fill_npg(), 
  ncol = 2, 
  top = "Nature Publishing Group Style (ggsci)"
)

3.6 cowplot的多图排版

library(ggplot2)
library(cowplot)
# 箱线图 (Boxplot)
bxp <- ggplot(ToothGrowth, aes(x = factor(dose), y = len, color = factor(dose))) +
  geom_boxplot() +
  scale_color_jco() +
  labs(x = "Dose", y = "Length", color = "Dose") # 添加标签
bxp

dp <- ggplot(ToothGrowth, aes(x = factor(dose), y = len, fill = factor(dose))) +
  # 注意:geom_dotplot 需要 stackgroups=TRUE 和 binaxis="y"
  # binwidth = 1 对应原始函数中的参数
  geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 1.1) + 
  scale_fill_jco() +
  labs(x = "Dose", y = "Length", fill = "Dose")
dp

data("mtcars")
mtcars$name <- rownames(mtcars)
mtcars$cyl <- as.factor(mtcars$cyl)

# 1. 数据预处理:实现分组排序 (sort.by.groups = TRUE, sort.val = "asc")
# 找出每个 name (车型) 在其 cyl 组内的排序位置
mtcars_sorted <- mtcars %>%
  group_by(cyl) %>%
  arrange(mpg) %>%
  mutate(name_rank = row_number()) %>%
  ungroup() %>%
  # 使用 forcats 包的 fct_reorder2 或手动创建排序因子
  # 为了模仿 ggpubr 的分组排序效果,这里按 cyl 和 mpg 的升序创建因子水平
  arrange(cyl, mpg) %>%
  mutate(name = factor(name, levels = unique(name)))

# 2. 绘制棒状图
bp <- ggplot(mtcars_sorted, aes(x = name, y = mpg, fill = cyl)) +
  # 绘制柱状图,stat="identity" 表示 y 值直接取数据中的 mpg
  # color="white" 用于设置柱子边框颜色
  geom_bar(stat = "identity", color = "white") +
  # 配色方案
  scale_fill_jco() +
  # 调整主题:旋转 x 轴文本
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    legend.position = "right"
  ) +
  labs(x = "Car Model", y = "Miles Per Gallon (MPG)", fill = "Cylinders")

bp

library("cowplot")
plot_grid(bxp, dp, bp + theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()), 
          labels = c("A", "B", "C"),
          ncol = 2, nrow = 2)


模块四:数据叙事与交互

目标:突出重点,制作交互式网页图表。

4.1 数据叙事:高亮重点 (Highlighting)

在所有灰色的背景点中,高亮中国、美国等重点国家,引导读者视线。

labels_data <- clean_data_2007 %>%
  filter(country %in% c("China", "United States", "India", "Nigeria"))

p_story <- ggplot(clean_data_2007, aes(x = gdpPercap, y = lifeExp)) +
  # 背景层:灰色
  geom_point(aes(size = pop_m), color = "grey85") +
  # 高亮层:彩色
  geom_point(data = labels_data, aes(size = pop_m, color = continent), alpha = 0.9) +
  # 标签层:避免重叠
  geom_text_repel(data = labels_data, aes(label = country), box.padding = 0.5) +
  
  scale_x_log10(labels = dollar_format()) +
  scale_size(range = c(2, 12), guide = "none") +
  scale_color_brewer(palette = "Set1") +
  my_journal_theme +
  labs(title = "发展中大国的崛起", subtitle = "重点关注:中国 vs 美国")

p_story

4.2 交互式图表 (Plotly)

将静态图表转换为 HTML 交互组件。试着将鼠标悬停在点上

library(plotly)
interactive_plot <- ggplotly(p4, tooltip = c("x", "y", "color", "size"))
interactive_plot

4.3 进阶交互:时间轴动画

重现 Hans Rosling 的经典气泡图动画。点击图表下方的 Play 按钮

p_motion <- ggplot(gapminder, aes(x = gdpPercap, y = lifeExp, 
                                  size = pop, color = continent, 
                                  frame = year,      # 核心:动画帧
                                  text = country)) +
  geom_point(alpha = 0.7) +
  scale_x_log10(labels = dollar_format()) +
  scale_size(range = c(2, 20), guide = "none") +
  scale_color_viridis(discrete = TRUE, option = "D") +
  theme_minimal() +
  labs(title = "全球发展动态轨迹 (1952-2007)", x = "人均 GDP", y = "预期寿命")

ggplotly(p_motion, tooltip = "text") %>% 
  animation_opts(frame = 1000, easing = "linear", redraw = FALSE) %>% 
  animation_button(x = 1, xanchor = "right", y = 0, yanchor = "bottom")