R数据可视化手册第15章数据塑形

# install.packages('ggplot2')    # 安装ggplot2包
# install.packages('gcookbook')  # 安装gcookbook包
library(ggplot2)  #加载ggplot2包
library(gcookbook)  #加载本书的数据包

15.5 重命名数据框的列名

使用names(data)<-函数即可

names(anthoming)

## [1] "angle" "expt"  "ctrl"

# Rename the "ctrl" column to "Control"; every other column name is untouched.
names(anthoming)[names(anthoming) == "ctrl"] <- "Control"
names(anthoming)

## [1] "angle"   "expt"    "Control"

15.6 重排序数据框的列

通过列的数值位置重排序，亦可通过列的名称重排序

anthoming

##   angle expt Control
## 1   -20    1       0
## 2   -10    7       3
## 3     0    2       3
## 4    10    0       3
## 5    20    0       1

anthoming[c(3, 2, 1)]

##   Control expt angle
## 1       0    1   -20
## 2       3    7   -10
## 3       3    2     0
## 4       3    0    10
## 5       1    0    20

# 单独检索一列的时候，列表风格的索引会得到数据框
anthoming[3]  #数据框

##   Control
## 1       0
## 2       3
## 3       3
## 4       3
## 5       1

# 矩阵风格的索引得到的是向量
anthoming[, 3]  #向量

## [1] 0 3 3 3 1

anthoming[, 3, drop = FALSE]  #数据框

##   Control
## 1       0
## 2       3
## 3       3
## 4       3
## 5       1

15.7 从数据框中提取子集

使用subset函数筛选出符合一系列条件的行和列

subset(x, subset, select, drop = FALSE, …)

x: 数据框
subset: 观测筛选的条件
select: 数据框选择的变量
drop: 传递给索引操作符 `[` 的参数
…: 传递给其他方法的其他参数

head(climate)

##     Source Year Anomaly1y Anomaly5y Anomaly10y Unc10y
## 1 Berkeley 1800        NA        NA     -0.435  0.505
## 2 Berkeley 1801        NA        NA     -0.453  0.493
## 3 Berkeley 1802        NA        NA     -0.460  0.486
## 4 Berkeley 1803        NA        NA     -0.493  0.489
## 5 Berkeley 1804        NA        NA     -0.536  0.483
## 6 Berkeley 1805        NA        NA     -0.541  0.475

# Keep the Berkeley rows from 1900 onward and only the Year / Anomaly10y
# columns, then show the first five matches. head() is used instead of
# [1:5, ] because the latter pads the result with all-NA rows whenever fewer
# than five rows satisfy the condition.
head(
  subset(
    climate,
    subset = Source == "Berkeley" & Year >= 1900,
    select = c(Year, Anomaly10y)
  ),
  5
)

##     Year Anomaly10y
## 101 1900     -0.171
## 102 1901     -0.162
## 103 1902     -0.177
## 104 1903     -0.199
## 105 1904     -0.223

15.8 改变因子水平的顺序

因子的水平可以由函数factor具体设定

# 因子水平的默认顺序是按字母排序的
sizes <- factor(c("small", "large", "large", "small", "medium"))
sizes

## [1] small  large  large  small  medium
## Levels: large medium small

sizes <- factor(sizes, levels = c("small", "medium", "large"))
sizes

## [1] small  large  large  small  medium
## Levels: small medium large

factor(sizes, levels = rev(levels(sizes)))  #因子水平的逆序

## [1] small  large  large  small  medium
## Levels: large medium small

15.9 根据数据的值改变因子水平的顺序

使用reorder函数

reorder(x, X, FUN = mean, …, order = is.ordered(x))

x：因子
X：排序依据的数据
FUN：汇总数据的函数

iss <- InsectSprays
iss$spray

##  [1] A A A A A A A A A A A A B B B B B B B B B B B B C C C C C C C C C C C
## [36] C D D D D D D D D D D D D E E E E E E E E E E E E F F F F F F F F F F
## [71] F F
## Levels: A B C D E F

# Reorder the levels of iss$spray by the mean of iss$count within each level.
iss$spray <- reorder(x = iss$spray, X = iss$count, FUN = mean)
iss$spray  # Levels were A B C D E F; after reordering they are C E D A B F, i.e. in increasing order of each spray's mean iss$count.

##  [1] A A A A A A A A A A A A B B B B B B B B B B B B C C C C C C C C C C C
## [36] C D D D D D D D D D D D D E E E E E E E E E E E E F F F F F F F F F F
## [71] F F
## attr(,"scores")
##         A         B         C         D         E         F 
## 14.500000 15.333333  2.083333  4.916667  3.500000 16.666667 
## Levels: C E D A B F

15.10 改变因子水平的名称

使用plyr包的revalue()函数和mapvalues()函数。

revalue(x, replace = NULL, warn_missing = TRUE)
mapvalues(x, from, to, warn_missing = TRUE)

sizes <- factor(c("small", "large", "large", "small", "medium"))
sizes

## [1] small  large  large  small  medium
## Levels: large medium small

library(plyr)  #加载包
# 通过revalue函数，传递一组映射关系
sizes1 <- revalue(sizes, c(small = "S", medium = "M", large = "L"))
sizes1

## [1] S L L S M
## Levels: L M S

# The names in the mapping can also be quoted, which is required when a level
# name contains spaces or other special characters.
revalue(sizes, c("small" = "S", "medium" = "M", "large" = "L"))

## [1] S L L S M
## Levels: L M S

# mapvalues函数使用两组向量，而不是一组映射关系向量
mapvalues(sizes, c("small", "medium", "large"), c("S", "M", "L"))

## [1] S L L S M
## Levels: L M S

# 如果要改变因子所有的水平的名称，可以给levels()传递一个list类型的参数
sizes <- factor(c("small", "large", "large", "small", "medium"))
sizes

## [1] small  large  large  small  medium
## Levels: large medium small

levels(sizes) <- list(S = "small", M = "medium", L = "large")
sizes

## [1] S L L S M
## Levels: S M L

15.11 去掉因子中不再使用的水平

使用droplevels()函数

sizes <- factor(c("small", "large", "large", "small", "medium"))
sizes

## [1] small  large  large  small  medium
## Levels: large medium small

sizes2 <- sizes[1:3]
sizes2

## [1] small large large
## Levels: large medium small

sizes2 <- droplevels(sizes2)
sizes2

## [1] small large large
## Levels: large small

15.12 在字符向量中改变元素的名称

应用plyr包的函数revalue或mapvalues

sizes <- c("small", "large", "large", "small", "medium")
sizes

## [1] "small"  "large"  "large"  "small"  "medium"

library(plyr)
revalue(sizes, c(small = "S", medium = "M", large = "L"))

## [1] "S" "L" "L" "S" "M"

# Same mapping written with quoted names; quoting is required when an element
# name contains spaces or other special characters.
revalue(sizes, c("small" = "S", "medium" = "M", "large" = "L"))

## [1] "S" "L" "L" "S" "M"

mapvalues(sizes, c("small", "medium", "large"), c("S", "M", "L"))

## [1] "S" "L" "L" "S" "M"

15.13 把一个分类变量转化成另一个分类变量

pg <- PlantGrowth[c(1, 2, 11, 21, 22), ]
pg

##    weight group
## 1    4.17  ctrl
## 2    5.58  ctrl
## 11   4.81  trt1
## 21   6.31  trt2
## 22   5.12  trt2

oldvals <- c("ctrl", "trt1", "trt2")
newvals <- c("No", "Yes", "Yes")
pg$treatment <- newvals[match(pg$group, oldvals)]
match(pg$group, oldvals)  #返回pg$group的每个元素在oldvals中的索引

## [1] 1 1 2 3 3

pg

##    weight group treatment
## 1    4.17  ctrl        No
## 2    5.58  ctrl        No
## 11   4.81  trt1       Yes
## 21   6.31  trt2       Yes
## 22   5.12  trt2       Yes

# 一般的向量索引方法,还可以使用&和|连接多个条件：
pg$treatment[pg$group == "ctrl"] <- "No"
pg$treatment[pg$group == "trt1"] <- "Yes"
pg$treatment[pg$group == "trt2"] <- "Yes"
pg

##    weight group treatment
## 1    4.17  ctrl        No
## 2    5.58  ctrl        No
## 11   4.81  trt1       Yes
## 21   6.31  trt2       Yes
## 22   5.12  trt2       Yes

pg$treatment[pg$group == "ctrl" & pg$weight < 5] <- "No_small"
pg$treatment[pg$group == "ctrl" & pg$weight >= 5] <- "No_large"
pg

##    weight group treatment
## 1    4.17  ctrl  No_small
## 2    5.58  ctrl  No_large
## 11   4.81  trt1       Yes
## 21   6.31  trt2       Yes
## 22   5.12  trt2       Yes

15.14 将连续变量转变为分类变量

使用cut函数可将连续变量分段，转化为分类变量（因子）。

pg <- PlantGrowth[c(1, 2, 11, 21, 22), ]
pg

##    weight group
## 1    4.17  ctrl
## 2    5.58  ctrl
## 11   4.81  trt1
## 21   6.31  trt2
## 22   5.12  trt2

pg$wtclass <- cut(pg$weight, breaks = c(0, 5, 6, Inf))
pg

##    weight group wtclass
## 1    4.17  ctrl   (0,5]
## 2    5.58  ctrl   (5,6]
## 11   4.81  trt1   (0,5]
## 21   6.31  trt2 (6,Inf]
## 22   5.12  trt2   (5,6]

pg$wtclass <- cut(pg$weight, breaks = c(0, 5, 6, Inf), labels = c("small", "medium", 
    "large"))
pg

##    weight group wtclass
## 1    4.17  ctrl   small
## 2    5.58  ctrl  medium
## 11   4.81  trt1   small
## 21   6.31  trt2   large
## 22   5.12  trt2  medium

# cut生成的区间默认是左开右闭的，即每个区间不包含其左端点、但包含其右端点，因此恰好等于最小断点的值不会落入任何区间；设定参数include.lowest=TRUE可使第一个区间也包含最小断点
cut(pg$weight, breaks = c(0, 5, 6, Inf), include.lowest = TRUE)

## [1] [0,5]   (5,6]   [0,5]   (6,Inf] (5,6]  
## Levels: [0,5] (5,6] (6,Inf]

# 若要生成左闭右开的区间，可设定参数right=FALSE
cut(pg$weight, breaks = c(0, 5, 6, Inf), right = FALSE)

## [1] [0,5)   [5,6)   [0,5)   [6,Inf) [5,6)  
## Levels: [0,5) [5,6) [6,Inf)

15.15 变量转换

library(gcookbook)
hw <- heightweight
head(hw)

##   sex ageYear ageMonth heightIn weightLb
## 1   f   11.92      143     56.3     85.0
## 2   f   12.92      155     62.3    105.0
## 3   f   12.75      153     63.3    108.0
## 4   f   13.42      161     59.0     92.0
## 5   f   15.92      191     62.5    112.5
## 6   f   14.25      171     62.5    112.0

# 将heightIn的单位由英寸转换成厘米（1英寸 = 2.54厘米），然后存储到新的一列heightCm中：
hw$heightCm <- hw$heightIn * 2.54
head(hw)

##   sex ageYear ageMonth heightIn weightLb heightCm
## 1   f   11.92      143     56.3     85.0  143.002
## 2   f   12.92      155     62.3    105.0  158.242
## 3   f   12.75      153     63.3    108.0  160.782
## 4   f   13.42      161     59.0     92.0  149.860
## 5   f   15.92      191     62.5    112.5  158.750
## 6   f   14.25      171     62.5    112.0  158.750

# 还可以使用transform函数：
hw <- heightweight
head(hw)

##   sex ageYear ageMonth heightIn weightLb
## 1   f   11.92      143     56.3     85.0
## 2   f   12.92      155     62.3    105.0
## 3   f   12.75      153     63.3    108.0
## 4   f   13.42      161     59.0     92.0
## 5   f   15.92      191     62.5    112.5
## 6   f   14.25      171     62.5    112.0

hw <- transform(hw, heightCm = heightIn * 2.54, weightKg = weightLb/2.204)
head(hw)

##   sex ageYear ageMonth heightIn weightLb heightCm weightKg
## 1   f   11.92      143     56.3     85.0  143.002 38.56624
## 2   f   12.92      155     62.3    105.0  158.242 47.64065
## 3   f   12.75      153     63.3    108.0  160.782 49.00181
## 4   f   13.42      161     59.0     92.0  149.860 41.74229
## 5   f   15.92      191     62.5    112.5  158.750 51.04356
## 6   f   14.25      171     62.5    112.0  158.750 50.81670

# 还可以使用plyr包的mutate函数
hw <- heightweight
head(hw)

##   sex ageYear ageMonth heightIn weightLb
## 1   f   11.92      143     56.3     85.0
## 2   f   12.92      155     62.3    105.0
## 3   f   12.75      153     63.3    108.0
## 4   f   13.42      161     59.0     92.0
## 5   f   15.92      191     62.5    112.5
## 6   f   14.25      171     62.5    112.0

library(plyr)
hw <- mutate(hw, heightCm = heightIn * 2.54, weightKg = weightLb/2.204)
head(hw)

##   sex ageYear ageMonth heightIn weightLb heightCm weightKg
## 1   f   11.92      143     56.3     85.0  143.002 38.56624
## 2   f   12.92      155     62.3    105.0  158.242 47.64065
## 3   f   12.75      153     63.3    108.0  160.782 49.00181
## 4   f   13.42      161     59.0     92.0  149.860 41.74229
## 5   f   15.92      191     62.5    112.5  158.750 51.04356
## 6   f   14.25      171     62.5    112.0  158.750 50.81670

# 以下语句结果一样：
hw <- transform(hw, bmi = weightKg/(heightCm/100)^2)
hw <- mutate(hw, bmi = weightKg/(heightCm/100)^2)
hw$bmi <- hw$weightKg/(hw$heightCm/100)^2
# transform和mutate函数的最大区别在于transform会同时计算所有的新列，而mutate将依次计算得到，这样在计算新列时就可使用之前的新列
hw <- heightweight
library(plyr)
hw <- mutate(hw, heightCm = heightIn * 2.54, weightKg = weightLb/2.204, bmi = weightKg/(heightCm/100)^2)
head(hw)

##   sex ageYear ageMonth heightIn weightLb heightCm weightKg      bmi
## 1   f   11.92      143     56.3     85.0  143.002 38.56624 18.85919
## 2   f   12.92      155     62.3    105.0  158.242 47.64065 19.02542
## 3   f   12.75      153     63.3    108.0  160.782 49.00181 18.95559
## 4   f   13.42      161     59.0     92.0  149.860 41.74229 18.58681
## 5   f   15.92      191     62.5    112.5  158.750 51.04356 20.25412
## 6   f   14.25      171     62.5    112.0  158.750 50.81670 20.16411

15.16 按组转换数据

使用plyr包中的ddply()函数，在参数中调用transform()，并指定运算

library(MASS)
library(plyr)
head(cabbages)

##   Cult Date HeadWt VitC
## 1  c39  d16    2.5   51
## 2  c39  d16    2.2   55
## 3  c39  d16    3.1   45
## 4  c39  d16    4.3   42
## 5  c39  d16    2.5   53
## 6  c39  d16    4.3   50

cb <- ddply(cabbages, "Cult", transform, DevWt = HeadWt - mean(HeadWt))
head(cb)

##   Cult Date HeadWt VitC      DevWt
## 1  c39  d16    2.5   51 -0.4066667
## 2  c39  d16    2.2   55 -0.7066667
## 3  c39  d16    3.1   45  0.1933333
## 4  c39  d16    4.3   42  1.3933333
## 5  c39  d16    2.5   53 -0.4066667
## 6  c39  d16    4.3   50  1.3933333

cb1 <- ddply(cabbages, c("Cult", "Date"), transform, DevWt = HeadWt - mean(HeadWt), 
    DevVitC = VitC - mean(VitC))
head(cb1)

##   Cult Date HeadWt VitC DevWt DevVitC
## 1  c39  d16    2.5   51 -0.68     0.7
## 2  c39  d16    2.2   55 -0.98     4.7
## 3  c39  d16    3.1   45 -0.08    -5.3
## 4  c39  d16    4.3   42  1.12    -8.3
## 5  c39  d16    2.5   53 -0.68     2.7
## 6  c39  d16    4.3   50  1.12    -0.3

15.17 分组汇总数据

library(MASS)
library(plyr)
ddply(cabbages, c("Cult", "Date"), summarise, Weight = mean(HeadWt), VitC = mean(VitC))

##   Cult Date Weight VitC
## 1  c39  d16   3.18 50.3
## 2  c39  d20   2.80 49.4
## 3  c39  d21   2.74 54.8
## 4  c52  d16   2.26 62.5
## 5  c52  d20   3.11 58.9
## 6  c52  d21   1.47 71.8

ddply(cabbages, c("Cult", "Date"), summarise, Weight = mean(HeadWt), sd = sd(VitC), 
    n = length(HeadWt))

##   Cult Date Weight       sd  n
## 1  c39  d16   3.18 4.270051 10
## 2  c39  d20   2.80 8.329332 10
## 3  c39  d21   2.74 7.568942 10
## 4  c52  d16   2.26 5.797509 10
## 5  c52  d20   3.11 7.738073 10
## 6  c52  d21   1.47 6.196773 10

15.18 使用标准误差和置信区间来汇总数据

library(MASS)
library(plyr)
# Per Cult/Date group: mean head weight, plus the standard deviation and
# standard error of VitC. Note that sd and se describe VitC, not HeadWt.
# n counts only the non-missing VitC values so that it matches the
# observations sd() actually used under na.rm = TRUE; length() would also
# count NAs and make se too small.
ddply(cabbages, c("Cult", "Date"), summarise,
  Weight = mean(HeadWt, na.rm = TRUE),
  sd = sd(VitC, na.rm = TRUE),
  n = sum(!is.na(VitC)),
  se = sd / sqrt(n)
)

##   Cult Date Weight       sd  n       se
## 1  c39  d16   3.18 4.270051 10 1.350309
## 2  c39  d20   2.80 8.329332 10 2.633966
## 3  c39  d21   2.74 7.568942 10 2.393510
## 4  c52  d16   2.26 5.797509 10 1.833333
## 5  c52  d20   3.11 7.738073 10 2.446994
## 6  c52  d21   1.47 6.196773 10 1.959592

15.19 把数据框从“宽”变“长”

使用reshape2包的melt函数 melt(data, id.vars, measure.vars, variable.name = "variable", …, na.rm = FALSE, value.name = "value", factorsAsStrings = TRUE)

data:需要处理的数据集
id.vars:标识变量，即保留的变量，如果不指定，默认所有的非度量变量
measure.vars:度量变量，即需要融合的变量，如果不指定，默认所有的非标识变量
variable.name:度量变量的名称构成的新变量取值，这个新变量的名称
value.name:度量变量的值构成的新变量的名称
na.rm:是否移除缺失数据

library(gcookbook)
anthoming

##   angle expt Control
## 1   -20    1       0
## 2   -10    7       3
## 3     0    2       3
## 4    10    0       3
## 5    20    0       1

library(reshape2)
melt(data = anthoming, id.vars = "angle", variable.name = "condition", value.name = "count")

##    angle condition count
## 1    -20      expt     1
## 2    -10      expt     7
## 3      0      expt     2
## 4     10      expt     0
## 5     20      expt     0
## 6    -20   Control     0
## 7    -10   Control     3
## 8      0   Control     3
## 9     10   Control     3
## 10    20   Control     1

melt(data = anthoming, id.vars = "angle", measure.vars = "expt", variable.name = "condition", 
    value.name = "count")

##   angle condition count
## 1   -20      expt     1
## 2   -10      expt     7
## 3     0      expt     2
## 4    10      expt     0
## 5    20      expt     0

drunk

##      sex 0-29 30-39 40-49 50-59 60+
## 1   male  185   207   260   180  71
## 2 female    4    13    10     7  10

melt(drunk, id.vars = "sex", measure.vars = c("0-29", "30-39"), variable.name = "age", 
    value.name = "count")  #measure.vars指明选入的度量变量

##      sex   age count
## 1   male  0-29   185
## 2 female  0-29     4
## 3   male 30-39   207
## 4 female 30-39    13

plum_wide

##   length      time dead alive
## 1   long   at_once   84   156
## 2   long in_spring  156    84
## 3  short   at_once  133   107
## 4  short in_spring  209    31

# Use several columns (length and time) as id variables; the remaining
# columns (dead, alive) are melted into survival/count pairs. The variable
# name is spelled "survival" so the result matches the `plum` data set.
melt(plum_wide, id.vars = c("length", "time"), variable.name = "survival",
    value.name = "count")

##   length      time survival count
## 1   long   at_once     dead    84
## 2   long in_spring     dead   156
## 3  short   at_once     dead   133
## 4  short in_spring     dead   209
## 5   long   at_once    alive   156
## 6   long in_spring    alive    84
## 7  short   at_once    alive   107
## 8  short in_spring    alive    31

15.20 把数据框从“长”变“宽”

使用reshape2包的dcast函数 dcast(data, formula, fun.aggregate = NULL, …, margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value.var = guess_value(data))

library(gcookbook)
library(reshape2)
plum

##   length      time survival count
## 1   long   at_once     dead    84
## 2   long in_spring     dead   156
## 3  short   at_once     dead   133
## 4  short in_spring     dead   209
## 5   long   at_once    alive   156
## 6   long in_spring    alive    84
## 7  short   at_once    alive   107
## 8  short in_spring    alive    31

dcast(plum, length + time ~ survival, value.var = "count")  #类似于excel中的数据透视表，length和time作为行字段，survival作为列字段

##   length      time dead alive
## 1   long   at_once   84   156
## 2   long in_spring  156    84
## 3  short   at_once  133   107
## 4  short in_spring  209    31

15.21 把时间序列数据对象拆分成时间和数据

time()函数可以获取每个观测的时间值，然后使用as.numeric()函数可将时间和该数据转化为数值形式。

nhtemp  #数据集

## Time Series:
## Start = 1912 
## End = 1971 
## Frequency = 1 
##  [1] 49.9 52.3 49.4 51.1 49.4 47.9 49.8 50.9 49.3 51.9 50.8 49.6 49.3 50.6
## [15] 48.4 50.7 50.9 50.6 51.5 52.8 51.8 51.1 49.8 50.2 50.4 51.6 51.8 50.9
## [29] 48.8 51.7 51.0 50.6 51.7 51.5 52.1 51.3 51.0 54.0 51.4 52.7 53.1 54.6
## [43] 52.0 52.0 50.9 52.6 50.2 52.6 51.6 51.9 50.5 50.9 51.7 51.4 51.7 50.8
## [57] 51.9 51.8 51.9 53.0

as.numeric(time(nhtemp))

##  [1] 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925
## [15] 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939
## [29] 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953
## [43] 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967
## [57] 1968 1969 1970 1971

as.numeric(nhtemp)

##  [1] 49.9 52.3 49.4 51.1 49.4 47.9 49.8 50.9 49.3 51.9 50.8 49.6 49.3 50.6
## [15] 48.4 50.7 50.9 50.6 51.5 52.8 51.8 51.1 49.8 50.2 50.4 51.6 51.8 50.9
## [29] 48.8 51.7 51.0 50.6 51.7 51.5 52.1 51.3 51.0 54.0 51.4 52.7 53.1 54.6
## [43] 52.0 52.0 50.9 52.6 50.2 52.6 51.6 51.9 50.5 50.9 51.7 51.4 51.7 50.8
## [57] 51.9 51.8 51.9 53.0

# 合并成数据框的形式
# Split the ts object into its time index and its observed values, and store
# both as plain numeric columns of a data frame.
nht <- data.frame(
  year = as.numeric(time(nhtemp)),
  temp = as.numeric(nhtemp)
)
nht

##    year temp
## 1  1912 49.9
## 2  1913 52.3
## 3  1914 49.4
## 4  1915 51.1
## 5  1916 49.4
## 6  1917 47.9
## 7  1918 49.8
## 8  1919 50.9
## 9  1920 49.3
## 10 1921 51.9
## 11 1922 50.8
## 12 1923 49.6
## 13 1924 49.3
## 14 1925 50.6
## 15 1926 48.4
## 16 1927 50.7
## 17 1928 50.9
## 18 1929 50.6
## 19 1930 51.5
## 20 1931 52.8
## 21 1932 51.8
## 22 1933 51.1
## 23 1934 49.8
## 24 1935 50.2
## 25 1936 50.4
## 26 1937 51.6
## 27 1938 51.8
## 28 1939 50.9
## 29 1940 48.8
## 30 1941 51.7
## 31 1942 51.0
## 32 1943 50.6
## 33 1944 51.7
## 34 1945 51.5
## 35 1946 52.1
## 36 1947 51.3
## 37 1948 51.0
## 38 1949 54.0
## 39 1950 51.4
## 40 1951 52.7
## 41 1952 53.1
## 42 1953 54.6
## 43 1954 52.0
## 44 1955 52.0
## 45 1956 50.9
## 46 1957 52.6
## 47 1958 50.2
## 48 1959 52.6
## 49 1960 51.6
## 50 1961 51.9
## 51 1962 50.5
## 52 1963 50.9
## 53 1964 51.7
## 54 1965 51.4
## 55 1966 51.7
## 56 1967 50.8
## 57 1968 51.9
## 58 1969 51.8
## 59 1970 51.9
## 60 1971 53.0