# install.packages("ggplot2")    # install the ggplot2 package
# install.packages("gcookbook")  # install the gcookbook package
library(ggplot2) #加载ggplot2包
library(gcookbook) #加载本书的数据包
# Show the column names of the anthoming data set.
names(anthoming)
## [1] "angle" "expt" "ctrl"
# Rename the "ctrl" column to "Control" by matching on its current name.
names(anthoming)[names(anthoming) %in% "ctrl"] <- "Control"
names(anthoming)
## [1] "angle" "expt" "Control"
anthoming
## angle expt Control
## 1 -20 1 0
## 2 -10 7 3
## 3 0 2 3
## 4 10 0 3
## 5 20 0 1
# Reorder the columns by position (third, second, first).
anthoming[3:1]
## Control expt angle
## 1 0 1 -20
## 2 3 7 -10
## 3 3 2 0
## 4 3 0 10
## 5 1 0 20
# Selecting a single column with list-style indexing returns a data frame.
anthoming[3] # data frame
## Control
## 1 0
## 2 3
## 3 3
## 4 3
## 5 1
# Selecting a single column with matrix-style indexing returns a vector ...
anthoming[, 3] # vector
## [1] 0 3 3 3 1
# ... unless drop = FALSE is given, which keeps the data frame.
anthoming[, 3, drop = FALSE] # data frame
## Control
## 1 0
## 2 3
## 3 3
## 4 3
## 5 1
# Usage: subset(x, subset, select, drop = FALSE, ...)
# Preview the first rows of the climate data set.
head(climate)
## Source Year Anomaly1y Anomaly5y Anomaly10y Unc10y
## 1 Berkeley 1800 NA NA -0.435 0.505
## 2 Berkeley 1801 NA NA -0.453 0.493
## 3 Berkeley 1802 NA NA -0.460 0.486
## 4 Berkeley 1803 NA NA -0.493 0.489
## 5 Berkeley 1804 NA NA -0.536 0.483
## 6 Berkeley 1805 NA NA -0.541 0.475
# Keep the Berkeley rows from 1900 onward, retain only Year and Anomaly10y,
# and show the first five rows of the result.
subset(
  climate,
  Source == "Berkeley" & Year >= 1900,
  select = c(Year, Anomaly10y)
)[seq_len(5), ]
## Year Anomaly10y
## 101 1900 -0.171
## 102 1901 -0.162
## 103 1902 -0.177
## 104 1903 -0.199
## 105 1904 -0.223
# By default, factor levels are sorted alphabetically.
sizes <- as.factor(c("small", "large", "large", "small", "medium"))
sizes
## [1] small large large small medium
## Levels: large medium small
# Rebuild the factor with an explicit level order.
sizes <- factor(sizes, c("small", "medium", "large"))
sizes
## [1] small large large small medium
## Levels: small medium large
# Reverse the current level order.
factor(sizes, levels = rev(levels(sizes)))
## [1] small large large small medium
## Levels: large medium small
# Usage: reorder(x, X, FUN = mean, ..., order = is.ordered(x))
# Work on a copy of the built-in InsectSprays data set.
iss <- InsectSprays
iss$spray
## [1] A A A A A A A A A A A A B B B B B B B B B B B B C C C C C C C C C C C
## [36] C D D D D D D D D D D D D E E E E E E E E E E E E F F F F F F F F F F
## [71] F F
## Levels: A B C D E F
# Reorder the levels of spray by the mean of count within each spray.
iss$spray <- with(iss, reorder(spray, count, FUN = mean))
# The original level order is A B C D E F; after reordering it is C E D A B F.
# The new order is determined by the mean of iss$count within each level of
# iss$spray (stored in the "scores" attribute).
iss$spray
## [1] A A A A A A A A A A A A B B B B B B B B B B B B C C C C C C C C C C C
## [36] C D D D D D D D D D D D D E E E E E E E E E E E E F F F F F F F F F F
## [71] F F
## attr(,"scores")
## A B C D E F
## 14.500000 15.333333 2.083333 4.916667 3.500000 16.666667
## Levels: C E D A B F
# Usage: revalue(x, replace = NULL, warn_missing = TRUE)
# Usage: mapvalues(x, from, to, warn_missing = TRUE)
# Start from a factor with the default (alphabetical) level order.
sizes <- factor(c("small", "large", "large", "small", "medium"))
sizes
## [1] small large large small medium
## Levels: large medium small
library(plyr) # provides revalue() and mapvalues()
# revalue() takes a named vector: names are the old levels, values the new ones.
sizes1 <- revalue(sizes, c(small = "S", medium = "M", large = "L"))
sizes1
## [1] S L L S M
## Levels: L M S
# The names may also be quoted -- useful when a level name contains spaces or
# other special characters that are not valid in a bare name.
revalue(sizes, c("small" = "S", "medium" = "M", "large" = "L"))
## [1] S L L S M
## Levels: L M S
# mapvalues() takes two parallel vectors (from, to) instead of one named vector.
mapvalues(sizes, from = c("small", "medium", "large"), to = c("S", "M", "L"))
## [1] S L L S M
## Levels: L M S
# To rename every level of a factor at once, assign a named list to levels():
# each name is a new level and each value is the old level it replaces. The
# order of the list also becomes the new level order.
sizes <- as.factor(c("small", "large", "large", "small", "medium"))
sizes
## [1] small large large small medium
## Levels: large medium small
levels(sizes) <- list(
  S = "small",
  M = "medium",
  L = "large"
)
sizes
## [1] S L L S M
## Levels: S M L
# Use the droplevels() function.
sizes <- as.factor(c("small", "large", "large", "small", "medium"))
sizes
## [1] small large large small medium
## Levels: large medium small
# Subsetting a factor keeps all of its levels, including ones no longer used.
sizes2 <- sizes[seq_len(3)]
sizes2
## [1] small large large
## Levels: large medium small
# droplevels() removes the levels that do not occur in the data.
sizes2 <- droplevels(sizes2)
sizes2
## [1] small large large
## Levels: large small
# Use the revalue() or mapvalues() function from the plyr package.
# The same recoding functions also work on a plain character vector.
sizes <- c("small", "large", "large", "small", "medium")
sizes
## [1] "small" "large" "large" "small" "medium"
library(plyr)
# revalue() with a named vector: names are the old values, values the new ones.
revalue(sizes, c(small = "S", medium = "M", large = "L"))
## [1] "S" "L" "L" "S" "M"
# The same mapping with quoted names, needed when an old value contains spaces
# or other special characters.
revalue(sizes, c("small" = "S", "medium" = "M", "large" = "L"))
## [1] "S" "L" "L" "S" "M"
# mapvalues() takes the old and new values as two parallel vectors.
mapvalues(sizes, from = c("small", "medium", "large"), to = c("S", "M", "L"))
## [1] "S" "L" "L" "S" "M"
# Take a few rows of the built-in PlantGrowth data set as a small example.
pg <- PlantGrowth[c(1, 2, 11, 21, 22), ]
pg
## weight group
## 1 4.17 ctrl
## 2 5.58 ctrl
## 11 4.81 trt1
## 21 6.31 trt2
## 22 5.12 trt2
# Recode group into a new treatment column through a lookup: match() finds the
# position of each group value in oldvals, which then indexes newvals.
oldvals <- c("ctrl", "trt1", "trt2")
newvals <- c("No", "Yes", "Yes")
pg[["treatment"]] <- newvals[match(pg[["group"]], oldvals)]
match(pg$group, oldvals) # index of each element of pg$group within oldvals
## [1] 1 1 2 3 3
pg
## weight group treatment
## 1 4.17 ctrl No
## 2 5.58 ctrl No
## 11 4.81 trt1 Yes
## 21 6.31 trt2 Yes
## 22 5.12 trt2 Yes
# The same recoding with ordinary logical indexing; several conditions can be
# combined with & and |.
pg$treatment[pg$group == "ctrl"] <- "No"
pg$treatment[pg$group %in% c("trt1", "trt2")] <- "Yes"
pg
## weight group treatment
## 1 4.17 ctrl No
## 2 5.58 ctrl No
## 11 4.81 trt1 Yes
## 21 6.31 trt2 Yes
## 22 5.12 trt2 Yes
# Split the control group further by weight.
pg$treatment[with(pg, group == "ctrl" & weight < 5)] <- "No_small"
pg$treatment[with(pg, group == "ctrl" & weight >= 5)] <- "No_large"
pg
## weight group treatment
## 1 4.17 ctrl No_small
## 2 5.58 ctrl No_large
## 11 4.81 trt1 Yes
## 21 6.31 trt2 Yes
## 22 5.12 trt2 Yes
# The cut() function splits a continuous variable into intervals (bins).
# Start again from the same five PlantGrowth rows.
pg <- PlantGrowth[c(1, 2, 11, 21, 22), ]
pg
## weight group
## 1 4.17 ctrl
## 2 5.58 ctrl
## 11 4.81 trt1
## 21 6.31 trt2
## 22 5.12 trt2
# Bin weight into three intervals; the levels are named after the intervals.
pg[["wtclass"]] <- cut(pg[["weight"]], c(0, 5, 6, Inf))
pg
## weight group wtclass
## 1 4.17 ctrl (0,5]
## 2 5.58 ctrl (5,6]
## 11 4.81 trt1 (0,5]
## 21 6.31 trt2 (6,Inf]
## 22 5.12 trt2 (5,6]
# The same bins with descriptive labels instead of interval names.
pg[["wtclass"]] <- cut(
  pg[["weight"]],
  c(0, 5, 6, Inf),
  labels = c("small", "medium", "large")
)
pg
## weight group wtclass
## 1 4.17 ctrl small
## 2 5.58 ctrl medium
## 11 4.81 trt1 small
## 21 6.31 trt2 large
## 22 5.12 trt2 medium
# Intervals produced by cut() are open on the left and closed on the right, so
# the smallest break value itself is excluded. include.lowest = TRUE closes the
# first interval on the left so that both end points are included.
cut(pg[["weight"]], c(0, 5, 6, Inf), include.lowest = TRUE)
## [1] [0,5] (5,6] [0,5] (6,Inf] (5,6]
## Levels: [0,5] (5,6] (6,Inf]
# right = FALSE makes the intervals closed on the left and open on the right.
cut(pg[["weight"]], c(0, 5, 6, Inf), right = FALSE)
## [1] [0,5) [5,6) [0,5) [6,Inf) [5,6)
## Levels: [0,5) [5,6) [6,Inf)
library(gcookbook)
# Work on a copy of the heightweight data set.
hw <- heightweight
head(hw)
## sex ageYear ageMonth heightIn weightLb
## 1 f 11.92 143 56.3 85.0
## 2 f 12.92 155 62.3 105.0
## 3 f 12.75 153 63.3 108.0
## 4 f 13.42 161 59.0 92.0
## 5 f 15.92 191 62.5 112.5
## 6 f 14.25 171 62.5 112.0
# Convert heightIn from inches to centimetres (1 in = 2.54 cm) and store the
# result in a new column, heightCm.
hw[["heightCm"]] <- hw[["heightIn"]] * 2.54
head(hw)
## sex ageYear ageMonth heightIn weightLb heightCm
## 1 f 11.92 143 56.3 85.0 143.002
## 2 f 12.92 155 62.3 105.0 158.242
## 3 f 12.75 153 63.3 108.0 160.782
## 4 f 13.42 161 59.0 92.0 149.860
## 5 f 15.92 191 62.5 112.5 158.750
## 6 f 14.25 171 62.5 112.0 158.750
# The transform() function can add the new columns as well.
hw <- heightweight
head(hw)
## sex ageYear ageMonth heightIn weightLb
## 1 f 11.92 143 56.3 85.0
## 2 f 12.92 155 62.3 105.0
## 3 f 12.75 153 63.3 108.0
## 4 f 13.42 161 59.0 92.0
## 5 f 15.92 191 62.5 112.5
## 6 f 14.25 171 62.5 112.0
# Column names are looked up inside hw, so no hw$ prefix is needed.
hw <- transform(
  hw,
  heightCm = heightIn * 2.54,
  weightKg = weightLb / 2.204
)
head(hw)
## sex ageYear ageMonth heightIn weightLb heightCm weightKg
## 1 f 11.92 143 56.3 85.0 143.002 38.56624
## 2 f 12.92 155 62.3 105.0 158.242 47.64065
## 3 f 12.75 153 63.3 108.0 160.782 49.00181
## 4 f 13.42 161 59.0 92.0 149.860 41.74229
## 5 f 15.92 191 62.5 112.5 158.750 51.04356
## 6 f 14.25 171 62.5 112.0 158.750 50.81670
# The mutate() function from the plyr package is another option.
hw <- heightweight
head(hw)
## sex ageYear ageMonth heightIn weightLb
## 1 f 11.92 143 56.3 85.0
## 2 f 12.92 155 62.3 105.0
## 3 f 12.75 153 63.3 108.0
## 4 f 13.42 161 59.0 92.0
## 5 f 15.92 191 62.5 112.5
## 6 f 14.25 171 62.5 112.0
library(plyr)
hw <- mutate(
  hw,
  heightCm = heightIn * 2.54,
  weightKg = weightLb / 2.204
)
head(hw)
## sex ageYear ageMonth heightIn weightLb heightCm weightKg
## 1 f 11.92 143 56.3 85.0 143.002 38.56624
## 2 f 12.92 155 62.3 105.0 158.242 47.64065
## 3 f 12.75 153 63.3 108.0 160.782 49.00181
## 4 f 13.42 161 59.0 92.0 149.860 41.74229
## 5 f 15.92 191 62.5 112.5 158.750 51.04356
## 6 f 14.25 171 62.5 112.0 158.750 50.81670
# The next three statements produce the same result (hw must already contain
# the weightKg and heightCm columns):
hw <- transform(hw, bmi = weightKg / (heightCm / 100)^2)
hw <- mutate(hw, bmi = weightKg / (heightCm / 100)^2)
hw$bmi <- with(hw, weightKg / (heightCm / 100)^2)
# The main difference between transform() and mutate(): transform() computes
# all new columns at once, whereas mutate() computes them one after another, so
# a new column can be used when computing a later one in the same call.
hw <- heightweight
library(plyr)
hw <- mutate(
  hw,
  heightCm = heightIn * 2.54,
  weightKg = weightLb / 2.204,
  bmi = weightKg / (heightCm / 100)^2
)
head(hw)
## sex ageYear ageMonth heightIn weightLb heightCm weightKg bmi
## 1 f 11.92 143 56.3 85.0 143.002 38.56624 18.85919
## 2 f 12.92 155 62.3 105.0 158.242 47.64065 19.02542
## 3 f 12.75 153 63.3 108.0 160.782 49.00181 18.95559
## 4 f 13.42 161 59.0 92.0 149.860 41.74229 18.58681
## 5 f 15.92 191 62.5 112.5 158.750 51.04356 20.25412
## 6 f 14.25 171 62.5 112.0 158.750 50.81670 20.16411
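# Use the ddply() function from the plyr package, passing transform() as the
# function together with the operation to apply within each group.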
library(MASS)
library(plyr)
# Preview the cabbages data set (from the MASS package).
head(cabbages)
## Cult Date HeadWt VitC
## 1 c39 d16 2.5 51
## 2 c39 d16 2.2 55
## 3 c39 d16 3.1 45
## 4 c39 d16 4.3 42
## 5 c39 d16 2.5 53
## 6 c39 d16 4.3 50
# For each cultivar, add the deviation of HeadWt from that cultivar's mean.
cb <- ddply(
  .data = cabbages,
  .variables = "Cult",
  .fun = transform,
  DevWt = HeadWt - mean(HeadWt)
)
head(cb)
## Cult Date HeadWt VitC DevWt
## 1 c39 d16 2.5 51 -0.4066667
## 2 c39 d16 2.2 55 -0.7066667
## 3 c39 d16 3.1 45 0.1933333
## 4 c39 d16 4.3 42 1.3933333
## 5 c39 d16 2.5 53 -0.4066667
## 6 c39 d16 4.3 50 1.3933333
# Group by cultivar and date, and add the within-group deviations of both
# HeadWt and VitC.
cb1 <- ddply(
  .data = cabbages,
  .variables = c("Cult", "Date"),
  .fun = transform,
  DevWt = HeadWt - mean(HeadWt),
  DevVitC = VitC - mean(VitC)
)
head(cb1)
## Cult Date HeadWt VitC DevWt DevVitC
## 1 c39 d16 2.5 51 -0.68 0.7
## 2 c39 d16 2.2 55 -0.98 4.7
## 3 c39 d16 3.1 45 -0.08 -5.3
## 4 c39 d16 4.3 42 1.12 -8.3
## 5 c39 d16 2.5 53 -0.68 2.7
## 6 c39 d16 4.3 50 1.12 -0.3
library(MASS)
library(plyr)
# Collapse each Cult/Date group to one row holding the group means.
ddply(
  .data = cabbages,
  .variables = c("Cult", "Date"),
  .fun = summarise,
  Weight = mean(HeadWt),
  VitC = mean(VitC)
)
## Cult Date Weight VitC
## 1 c39 d16 3.18 50.3
## 2 c39 d20 2.80 49.4
## 3 c39 d21 2.74 54.8
## 4 c52 d16 2.26 62.5
## 5 c52 d20 3.11 58.9
## 6 c52 d21 1.47 71.8
# Per group: mean of HeadWt, standard deviation of VitC, and the row count.
ddply(
  .data = cabbages,
  .variables = c("Cult", "Date"),
  .fun = summarise,
  Weight = mean(HeadWt),
  sd = sd(VitC),
  n = length(HeadWt)
)
## Cult Date Weight sd n
## 1 c39 d16 3.18 4.270051 10
## 2 c39 d20 2.80 8.329332 10
## 3 c39 d21 2.74 7.568942 10
## 4 c52 d16 2.26 5.797509 10
## 5 c52 d20 3.11 7.738073 10
## 6 c52 d21 1.47 6.196773 10
library(MASS)
library(plyr)
# Per Cult/Date group: mean of HeadWt, plus the standard deviation, sample size
# and standard error of VitC. summarise() evaluates its arguments in order, so
# se can refer to the sd and n columns defined just before it.
# n counts only the non-missing VitC values so that it matches the observations
# actually used by sd(VitC, na.rm = TRUE); length() would also count NA rows and
# understate the standard error.
# NOTE(review): sd/n/se describe VitC while Weight is the mean of HeadWt --
# confirm that mixing the two variables is intended.
ddply(cabbages, c("Cult", "Date"), summarise,
  Weight = mean(HeadWt, na.rm = TRUE),
  sd = sd(VitC, na.rm = TRUE),
  n = sum(!is.na(VitC)),
  se = sd / sqrt(n)
)
## Cult Date Weight sd n se
## 1 c39 d16 3.18 4.270051 10 1.350309
## 2 c39 d20 2.80 8.329332 10 2.633966
## 3 c39 d21 2.74 7.568942 10 2.393510
## 4 c52 d16 2.26 5.797509 10 1.833333
## 5 c52 d20 3.11 7.738073 10 2.446994
## 6 c52 d21 1.47 6.196773 10 1.959592
# Use the melt() function from the reshape2 package. Usage:
# melt(data, id.vars, measure.vars, variable.name = "variable", ...,
#      na.rm = FALSE, value.name = "value", factorsAsStrings = TRUE)
library(gcookbook)
# The wide-format data: one row per angle, one column per condition.
anthoming
## angle expt Control
## 1 -20 1 0
## 2 -10 7 3
## 3 0 2 3
## 4 10 0 3
## 5 20 0 1
library(reshape2)
# Convert to long format: angle identifies each row, every other column is
# stacked into a condition/count pair.
melt(
  anthoming,
  id.vars = "angle",
  variable.name = "condition",
  value.name = "count"
)
## angle condition count
## 1 -20 expt 1
## 2 -10 expt 7
## 3 0 expt 2
## 4 10 expt 0
## 5 20 expt 0
## 6 -20 Control 0
## 7 -10 Control 3
## 8 0 Control 3
## 9 10 Control 3
## 10 20 Control 1
# Restrict the stacked columns to expt only via measure.vars.
melt(
  anthoming,
  id.vars = "angle",
  measure.vars = "expt",
  variable.name = "condition",
  value.name = "count"
)
## angle condition count
## 1 -20 expt 1
## 2 -10 expt 7
## 3 0 expt 2
## 4 10 expt 0
## 5 20 expt 0
drunk
## sex 0-29 30-39 40-49 50-59 60+
## 1 male 185 207 260 180 71
## 2 female 4 13 10 7 10
# measure.vars names the columns to include as measured variables; here only
# the two youngest age groups are stacked.
melt(
  drunk,
  id.vars = "sex",
  measure.vars = c("0-29", "30-39"),
  variable.name = "age",
  value.name = "count"
)
## sex age count
## 1 male 0-29 185
## 2 female 0-29 4
## 3 male 30-39 207
## 4 female 30-39 13
plum_wide
## length time dead alive
## 1 long at_once 84 156
## 2 long in_spring 156 84
## 3 short at_once 133 107
## 4 short in_spring 209 31
# Use several columns (length and time) as identifier variables; the remaining
# columns (dead, alive) are stacked into survival/count pairs.
melt(plum_wide, id.vars = c("length", "time"), variable.name = "survival",
  value.name = "count")
## length time survival count
## 1 long at_once dead 84
## 2 long in_spring dead 156
## 3 short at_once dead 133
## 4 short in_spring dead 209
## 5 long at_once alive 156
## 6 long in_spring alive 84
## 7 short at_once alive 107
## 8 short in_spring alive 31
# Use the dcast() function from the reshape2 package. Usage:
# dcast(data, formula, fun.aggregate = NULL, ..., margins = NULL,
#       subset = NULL, fill = NULL, drop = TRUE, value.var = guess_value(data))
library(gcookbook)
library(reshape2)
# The long-format data: one row per length/time/survival combination.
plum
## length time survival count
## 1 long at_once dead 84
## 2 long in_spring dead 156
## 3 short at_once dead 133
## 4 short in_spring dead 209
## 5 long at_once alive 156
## 6 long in_spring alive 84
## 7 short at_once alive 107
## 8 short in_spring alive 31
# Convert to wide format, much like a pivot table in Excel: length and time are
# the row fields, survival supplies the column headers, and count fills the
# cells.
dcast(
  data = plum,
  formula = length + time ~ survival,
  value.var = "count"
)
## length time dead alive
## 1 long at_once 84 156
## 2 long in_spring 156 84
## 3 short at_once 133 107
## 4 short in_spring 209 31
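# time() returns the time of each observation in a time series; as.numeric()
# then converts both the times and the series values to plain numeric vectors.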
# The built-in nhtemp data set, stored as a time series object.
nhtemp
## Time Series:
## Start = 1912
## End = 1971
## Frequency = 1
## [1] 49.9 52.3 49.4 51.1 49.4 47.9 49.8 50.9 49.3 51.9 50.8 49.6 49.3 50.6
## [15] 48.4 50.7 50.9 50.6 51.5 52.8 51.8 51.1 49.8 50.2 50.4 51.6 51.8 50.9
## [29] 48.8 51.7 51.0 50.6 51.7 51.5 52.1 51.3 51.0 54.0 51.4 52.7 53.1 54.6
## [43] 52.0 52.0 50.9 52.6 50.2 52.6 51.6 51.9 50.5 50.9 51.7 51.4 51.7 50.8
## [57] 51.9 51.8 51.9 53.0
# The observation times as a plain numeric vector.
as.numeric(time(nhtemp))
## [1] 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925
## [15] 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939
## [29] 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953
## [43] 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967
## [57] 1968 1969 1970 1971
# The observed values as a plain numeric vector.
as.numeric(nhtemp)
## [1] 49.9 52.3 49.4 51.1 49.4 47.9 49.8 50.9 49.3 51.9 50.8 49.6 49.3 50.6
## [15] 48.4 50.7 50.9 50.6 51.5 52.8 51.8 51.1 49.8 50.2 50.4 51.6 51.8 50.9
## [29] 48.8 51.7 51.0 50.6 51.7 51.5 52.1 51.3 51.0 54.0 51.4 52.7 53.1 54.6
## [43] 52.0 52.0 50.9 52.6 50.2 52.6 51.6 51.9 50.5 50.9 51.7 51.4 51.7 50.8
## [57] 51.9 51.8 51.9 53.0
# Combine the times and the values into a data frame.
nht <- data.frame(
  year = as.numeric(time(nhtemp)),
  temp = as.numeric(nhtemp)
)
nht
## year temp
## 1 1912 49.9
## 2 1913 52.3
## 3 1914 49.4
## 4 1915 51.1
## 5 1916 49.4
## 6 1917 47.9
## 7 1918 49.8
## 8 1919 50.9
## 9 1920 49.3
## 10 1921 51.9
## 11 1922 50.8
## 12 1923 49.6
## 13 1924 49.3
## 14 1925 50.6
## 15 1926 48.4
## 16 1927 50.7
## 17 1928 50.9
## 18 1929 50.6
## 19 1930 51.5
## 20 1931 52.8
## 21 1932 51.8
## 22 1933 51.1
## 23 1934 49.8
## 24 1935 50.2
## 25 1936 50.4
## 26 1937 51.6
## 27 1938 51.8
## 28 1939 50.9
## 29 1940 48.8
## 30 1941 51.7
## 31 1942 51.0
## 32 1943 50.6
## 33 1944 51.7
## 34 1945 51.5
## 35 1946 52.1
## 36 1947 51.3
## 37 1948 51.0
## 38 1949 54.0
## 39 1950 51.4
## 40 1951 52.7
## 41 1952 53.1
## 42 1953 54.6
## 43 1954 52.0
## 44 1955 52.0
## 45 1956 50.9
## 46 1957 52.6
## 47 1958 50.2
## 48 1959 52.6
## 49 1960 51.6
## 50 1961 51.9
## 51 1962 50.5
## 52 1963 50.9
## 53 1964 51.7
## 54 1965 51.4
## 55 1966 51.7
## 56 1967 50.8
## 57 1968 51.9
## 58 1969 51.8
## 59 1970 51.9
## 60 1971 53.0