1 初级数据管理

1.1 创建数据框

manager <- c(1, 2, 3, 4, 5)
date <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09")
country <- c("US", "US", "UK", "UK", "UK")
gender <- c("M", "F", "F", "M", "F")
age <- c(32, 45, 25, 39, 99)
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)
leadership <- data.frame(manager, date, country, gender, age, q1, q2, q3, q4, q5, stringsAsFactors=FALSE)

1.2 在数据框leadership中创建变量sumx, meanx

leadership$sumx = leadership$q1 + leadership$q2 
leadership$meanx = (leadership$q1 + leadership$q2)/2

#另外一种写法:

attach(leadership)
## The following objects are masked _by_ .GlobalEnv:
## 
##     age, country, date, gender, manager, q1, q2, q3, q4, q5
   leadership$meanx <- (q1 + q2)/2
   leadership$sumx <- q1 + q2
   # meanx <- (q1 + q2)/2
   # sumx <- q1 + q2
detach(leadership)

leadership
##   manager     date country gender age q1 q2 q3 q4 q5 sumx meanx
## 1       1 10/24/08      US      M  32  5  4  5  5  5    9   4.5
## 2       2 10/28/08      US      F  45  3  5  2  5  5    8   4.0
## 3       3  10/1/08      UK      F  25  3  5  5  5  2    8   4.0
## 4       4 10/12/08      UK      M  39  3  3  4 NA NA    6   3.0
## 5       5   5/1/09      UK      F  99  2  2  1  2  1    4   2.0
#第3种更为简洁的方式:
leadership <- transform(leadership,  sumx = q1 + q2,  meanx = (q1 + q2)/2)

leadership
##   manager     date country gender age q1 q2 q3 q4 q5 sumx meanx
## 1       1 10/24/08      US      M  32  5  4  5  5  5    9   4.5
## 2       2 10/28/08      US      F  45  3  5  2  5  5    8   4.0
## 3       3  10/1/08      UK      F  25  3  5  5  5  2    8   4.0
## 4       4 10/12/08      UK      M  39  3  3  4 NA NA    6   3.0
## 5       5   5/1/09      UK      F  99  2  2  1  2  1    4   2.0
leadership <-  plyr::mutate(leadership,  sumx = NULL,  meanx = NULL)

leadership <-  plyr::mutate(leadership,  sumx =q1 + q2,  meanx = (q1 + q2)/2)

leadership
##   manager     date country gender age q1 q2 q3 q4 q5 sumx meanx
## 1       1 10/24/08      US      M  32  5  4  5  5  5    9   4.5
## 2       2 10/28/08      US      F  45  3  5  2  5  5    8   4.0
## 3       3  10/1/08      UK      F  25  3  5  5  5  2    8   4.0
## 4       4 10/12/08      UK      M  39  3  3  4 NA NA    6   3.0
## 5       5   5/1/09      UK      F  99  2  2  1  2  1    4   2.0

1.3 数据框中变量的重编码

将leadership数据框中经理人的连续型年龄变量age重编码为类别型变量agecat(Young、 Middle Aged、Elder)

leadership$age[ leadership$age == 99 ] <- NA             #替换缺失值

leadership$agecat[leadership$age > 75] <- "Elder"
leadership$agecat[leadership$age >= 55 &
                  leadership$age <= 75] <- "Middle Aged"
leadership$agecat[leadership$age < 55] <- "Young"

#或者用下列函数更简洁地实现:
leadership <- within(leadership,{
  agecat <- NA
  agecat[age > 75] <- "Elder"
  agecat[age >= 55 & age <= 75] <- "Middle Aged"
  agecat[age < 55] <- "Young" })

leadership
##   manager     date country gender age q1 q2 q3 q4 q5 sumx meanx agecat
## 1       1 10/24/08      US      M  32  5  4  5  5  5    9   4.5  Young
## 2       2 10/28/08      US      F  45  3  5  2  5  5    8   4.0  Young
## 3       3  10/1/08      UK      F  25  3  5  5  5  2    8   4.0  Young
## 4       4 10/12/08      UK      M  39  3  3  4 NA NA    6   3.0  Young
## 5       5   5/1/09      UK      F  NA  2  2  1  2  1    4   2.0   <NA>

1.4 数据框中变量的重命名

#交互的方式
fix(leadership)

#编程的方式
library(plyr)   #或者library(reshape)
#rename(dataframe, c(oldname="newname", oldname="newname",…))
#注意这时并没有改变dataframe ,而是返回了一个新的数据框

rename(leadership, c(manager="managerID", date="testDate"))
##   managerID testDate country gender age q1 q2 q3 q4 q5 sumx meanx agecat
## 1         1 10/24/08      US      M  32  5  4  5  5  5    9   4.5  Young
## 2         2 10/28/08      US      F  45  3  5  2  5  5    8   4.0  Young
## 3         3  10/1/08      UK      F  25  3  5  5  5  2    8   4.0  Young
## 4         4 10/12/08      UK      M  39  3  3  4 NA NA    6   3.0  Young
## 5         5   5/1/09      UK      F  NA  2  2  1  2  1    4   2.0   <NA>
leadership
##   manager     date country gender age q1 q2 q3 q4 q5 sumx meanx agecat
## 1       1 10/24/08      US      M  32  5  4  5  5  5    9   4.5  Young
## 2       2 10/28/08      US      F  45  3  5  2  5  5    8   4.0  Young
## 3       3  10/1/08      UK      F  25  3  5  5  5  2    8   4.0  Young
## 4       4 10/12/08      UK      M  39  3  3  4 NA NA    6   3.0  Young
## 5       5   5/1/09      UK      F  NA  2  2  1  2  1    4   2.0   <NA>
leadership <- rename(leadership, c(manager="managerID", date="testDate")  )
leadership
##   managerID testDate country gender age q1 q2 q3 q4 q5 sumx meanx agecat
## 1         1 10/24/08      US      M  32  5  4  5  5  5    9   4.5  Young
## 2         2 10/28/08      US      F  45  3  5  2  5  5    8   4.0  Young
## 3         3  10/1/08      UK      F  25  3  5  5  5  2    8   4.0  Young
## 4         4 10/12/08      UK      M  39  3  3  4 NA NA    6   3.0  Young
## 5         5   5/1/09      UK      F  NA  2  2  1  2  1    4   2.0   <NA>
#不同包中的同名函数可能调用方式不一样:
#如果不指明包名,后边加载的包里边的函数会屏蔽前面的
#如,在dplyr中 rename(dataframe, newname= oldname, newname= oldname,…))
#为了明确调用不发生混淆,在函数名前指定包:
# dplyr::rename: 

library(dplyr)   
## 
## 载入程辑包:'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
dplyr::rename(leadership,  date = testDate)
##   managerID     date country gender age q1 q2 q3 q4 q5 sumx meanx agecat
## 1         1 10/24/08      US      M  32  5  4  5  5  5    9   4.5  Young
## 2         2 10/28/08      US      F  45  3  5  2  5  5    8   4.0  Young
## 3         3  10/1/08      UK      F  25  3  5  5  5  2    8   4.0  Young
## 4         4 10/12/08      UK      M  39  3  3  4 NA NA    6   3.0  Young
## 5         5   5/1/09      UK      F  NA  2  2  1  2  1    4   2.0   <NA>
leadership
##   managerID testDate country gender age q1 q2 q3 q4 q5 sumx meanx agecat
## 1         1 10/24/08      US      M  32  5  4  5  5  5    9   4.5  Young
## 2         2 10/28/08      US      F  45  3  5  2  5  5    8   4.0  Young
## 3         3  10/1/08      UK      F  25  3  5  5  5  2    8   4.0  Young
## 4         4 10/12/08      UK      M  39  3  3  4 NA NA    6   3.0  Young
## 5         5   5/1/09      UK      F  NA  2  2  1  2  1    4   2.0   <NA>
#也可以通过names()函数来重命名变量
names(leadership)[2] <- "testDate "
leadership
##   managerID testDate  country gender age q1 q2 q3 q4 q5 sumx meanx agecat
## 1         1  10/24/08      US      M  32  5  4  5  5  5    9   4.5  Young
## 2         2  10/28/08      US      F  45  3  5  2  5  5    8   4.0  Young
## 3         3   10/1/08      UK      F  25  3  5  5  5  2    8   4.0  Young
## 4         4  10/12/08      UK      M  39  3  3  4 NA NA    6   3.0  Young
## 5         5    5/1/09      UK      F  NA  2  2  1  2  1    4   2.0   <NA>
#或者直接对leadership中的若干相应变量的名称进行修改:
names(leadership)[6:10] <- c("item1", "item2", "item3", "item4", "item5")
leadership
##   managerID testDate  country gender age item1 item2 item3 item4 item5 sumx
## 1         1  10/24/08      US      M  32     5     4     5     5     5    9
## 2         2  10/28/08      US      F  45     3     5     2     5     5    8
## 3         3   10/1/08      UK      F  25     3     5     5     5     2    8
## 4         4  10/12/08      UK      M  39     3     3     4    NA    NA    6
## 5         5    5/1/09      UK      F  NA     2     2     1     2     1    4
##   meanx agecat
## 1   4.5  Young
## 2   4.0  Young
## 3   4.0  Young
## 4   3.0  Young
## 5   2.0   <NA>

1.5 缺失值NA

缺失值以符号NA(Not Available,不可用)表示; 不可能出现的值(例如,被0除的结果)通过符号NaN(Not a Number,非数值)来表示。

is.na(leadership[,6:10])
##   item1 item2 item3 item4 item5
## 1 FALSE FALSE FALSE FALSE FALSE
## 2 FALSE FALSE FALSE FALSE FALSE
## 3 FALSE FALSE FALSE FALSE FALSE
## 4 FALSE FALSE FALSE  TRUE  TRUE
## 5 FALSE FALSE FALSE FALSE FALSE
#利用na.omit()删除不完整的观测(行)
newdata <- na.omit(leadership)
newdata
##   managerID testDate  country gender age item1 item2 item3 item4 item5 sumx
## 1         1  10/24/08      US      M  32     5     4     5     5     5    9
## 2         2  10/28/08      US      F  45     3     5     2     5     5    8
## 3         3   10/1/08      UK      F  25     3     5     5     5     2    8
##   meanx agecat
## 1   4.5  Young
## 2   4.0  Young
## 3   4.0  Young

1.6 日期处理

#从字符串转为日期变量:
names(leadership)[2] <- "Date"

myformat <- "%m/%d/%y"
leadership$Date <- as.Date(leadership$Date,  myformat)
leadership
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 1         1 2008-10-24      US      M  32     5     4     5     5     5    9
## 2         2 2008-10-28      US      F  45     3     5     2     5     5    8
## 3         3 2008-10-01      UK      F  25     3     5     5     5     2    8
## 4         4 2008-10-12      UK      M  39     3     3     4    NA    NA    6
## 5         5 2009-05-01      UK      F  NA     2     2     1     2     1    4
##   meanx agecat
## 1   4.5  Young
## 2   4.0  Young
## 3   4.0  Young
## 4   3.0  Young
## 5   2.0   <NA>
#处理时间戳数据的函数
#Sys.Date()可以返回当天的日期(系统时间),而date()则返回当前的日期和时间
Sys.Date()
## [1] "2023-02-27"
date()
## [1] "Mon Feb 27 17:52:23 2023"
today <- Sys.Date()
format(today, format="%m %d %Y")
## [1] "02 27 2023"
format(today, format="%A")
## [1] "星期一"
format(today-1, format="%A")
## [1] "星期日"
#日期计算
startdate <- as.Date("2004-02-13")
enddate <- as.Date("2011-01-22") 
enddate - startdate
## Time difference of 2535 days
today <- Sys.Date()
dob <- as.Date("1956-10-12")
difftime(today, dob, units="weeks")
## Time difference of 3463.429 weeks
#将日期转换为字符型变量
strDates <- as.character(leadership$Date)
strDates
## [1] "2008-10-24" "2008-10-28" "2008-10-01" "2008-10-12" "2009-05-01"
a <- c(1,2,3)
is.numeric(a)
## [1] TRUE
is.vector(a)
## [1] TRUE
a <- as.character(a)
a
## [1] "1" "2" "3"
is.numeric(a)
## [1] FALSE
is.vector(a)
## [1] TRUE
is.character(a)
## [1] TRUE

1.7 时间类POSIXct和POSIXlt

POSIXct格式的时间:以有符号整数形式存储,表示从1970-01-01到该时间点经过的秒数; POSIXlt格式的时间:以字符串形式存储,包含年月日时间等。可以方便地提取年、月、日等变量

as.POSIXlt("2010/01/01")
## [1] "2010-01-01 CST"
as.POSIXlt("2010/01/01 10:30:30")
## [1] "2010-01-01 10:30:30 CST"
tt <- Sys.time()
as.POSIXlt(3600 * 24 * 1001, origin = tt)
## [1] "2025-11-24 17:52:23 CST"
as.POSIXlt(3600 * 24 * 1001, origin = tt, tz = "UTC")
## [1] "2025-11-24 09:52:23 UTC"
as.POSIXlt("10:30:30 2010/01/01", format = "%H:%M:%S %Y/%m/%d")
## [1] "2010-01-01 10:30:30 CST"
(ct <- as.POSIXct("1980-09-30 10:11:12"))
## [1] "1980-09-30 10:11:12 CST"
(lt <- as.POSIXlt("1980-09-30 10:11:12"))
## [1] "1980-09-30 10:11:12 CST"
unlist(lt)
##    sec    min   hour   mday    mon   year   wday   yday  isdst   zone gmtoff 
##   "12"   "11"   "10"   "30"    "8"   "80"    "2"  "273"    "0"  "CST"     NA
unlist(ct)
## [1] "1980-09-30 10:11:12 CST"
c(1900 + lt$year, lt$mon+1, lt$mday, lt$hour, lt$min, lt$sec)
## [1] 1980    9   30   10   11   12

1.8 时区

Sys.timezone()
## [1] "Asia/Taipei"
Sys.setenv(TZ ="Asia/Shanghai")
Sys.timezone()
## [1] "Asia/Shanghai"
Sys.getenv("TZ")
## [1] "Asia/Shanghai"
t <- as.POSIXlt("2008-04-08 7:10:15", format="%Y-%m-%d %H:%M:%S")
t
## [1] "2008-04-08 07:10:15 CST"
#不同时区的自动转换:
t <-  lubridate::as_datetime(t, tz= "UTC")
t
## [1] "2008-04-07 23:10:15 UTC"

1.9 lubridate包的使用

library(lubridate)
## 
## 载入程辑包:'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
ymd("20110604")
## [1] "2011-06-04"
mdy("06-04-2011")
## [1] "2011-06-04"
dmy("04/06/2011")
## [1] "2011-06-04"
arrive <- ymd_hms("2011-06-04 12:00:00", tz = "Asia/Shanghai")
arrive
## [1] "2011-06-04 12:00:00 CST"
leave <- ymd_hms("2011-08-10 14:00:00", tz = "Asia/Shanghai")
leave
## [1] "2011-08-10 14:00:00 CST"
second(arrive)
## [1] 0
second(arrive) <- 25
arrive
## [1] "2011-06-04 12:00:25 CST"
second(arrive) <- 0
wday(arrive)
## [1] 7
wday(arrive, label = TRUE)
## [1] 周六
## Levels: 周日 < 周一 < 周二 < 周三 < 周四 < 周五 < 周六
meeting <- ymd_hms("2011-07-01 09:00:00", tz = "Asia/Shanghai")
with_tz(meeting, "GMT")
## [1] "2011-07-01 01:00:00 GMT"
meeting
## [1] "2011-07-01 09:00:00 CST"
(mistake <- force_tz(meeting, "UTC"))
## [1] "2011-07-01 09:00:00 UTC"
with_tz(mistake, "Asia/Shanghai")
## [1] "2011-07-01 17:00:00 CST"
meeting <- ymd_hms("2011-07-01 09:00:00")
meeting
## [1] "2011-07-01 09:00:00 UTC"
with_tz(meeting, "Asia/Shanghai")
## [1] "2011-07-01 17:00:00 CST"
meeting <- ymd_hm("2011-07-01 09:00")
meeting
## [1] "2011-07-01 09:00:00 UTC"
meeting <- ymd_hm("11-07-01 09:00")
meeting
## [1] "2011-07-01 09:00:00 UTC"
meeting <- ymd_h("2011-07-01 09")
meeting
## [1] "2011-07-01 09:00:00 UTC"
meeting <- dmy_hms("01-07-2011 09:00:00")
meeting
## [1] "2011-07-01 09:00:00 UTC"
meeting <- dmy_hms("01-07-11 09:00:00")
meeting
## [1] "2011-07-01 09:00:00 UTC"
meeting <- dmy_hm("01-07-11 09:00")
meeting
## [1] "2011-07-01 09:00:00 UTC"
meeting <- dmy_h("01-07-11 09")
meeting
## [1] "2011-07-01 09:00:00 UTC"
meeting <- dmy_hm("010711 09:00")
meeting
## [1] "2011-07-01 09:00:00 UTC"
meeting <- dmy_hm("01/07/11 09/00")
meeting
## [1] "2011-07-01 09:00:00 UTC"
meeting <- mdy_hms("070111 09:00:15")
meeting
## [1] "2011-07-01 09:00:15 UTC"
meeting <- mdy_hm("0701110915")
meeting
## [1] "2011-07-01 09:15:00 UTC"
meeting <- mdy_h("07011109")
meeting
## [1] "2011-07-01 09:00:00 UTC"
meeting <- ydm_hms("11-01-07 09:15:00")
meeting
## [1] "2011-07-01 09:15:00 UTC"
meeting <- ydm_hm("11-01-07 09:15")
meeting
## [1] "2011-07-01 09:15:00 UTC"
meeting <- ydm_h("11010709")
meeting
## [1] "2011-07-01 09:00:00 UTC"
#Time Intervals

interval1 <- interval(arrive, leave)
interval1
## [1] 2011-06-04 12:00:00 CST--2011-08-10 14:00:00 CST
interval1 <- arrive %--% leave
interval1
## [1] 2011-06-04 12:00:00 CST--2011-08-10 14:00:00 CST
str(interval1)
## Formal class 'Interval' [package "lubridate"] with 3 slots
##   ..@ .Data: num 5796000
##   ..@ start: POSIXct[1:1], format: "2011-06-04 12:00:00"
##   ..@ tzone: chr "Asia/Shanghai"
interval2 <- interval(ymd(20110720, tz = "Asia/Shanghai"), ymd(20110831, tz = "Asia/Shanghai"))
interval2
## [1] 2011-07-20 CST--2011-08-31 CST
int_overlaps(interval2, interval1)
## [1] TRUE
#[1] TRUE

setdiff(interval1, interval2)
## [1] 2011-06-04 12:00:00 CST--2011-07-20 CST
setdiff(interval2, interval1)
## [1] 2011-08-10 14:00:00 CST--2011-08-31 CST
#Arithmetic with date times

minutes(2) ## period
## [1] "2M 0S"
dminutes(2) ## duration
## [1] "120s (~2 minutes)"
leap_year(2011) ## regular year
## [1] FALSE
# [1] FALSE

ymd(20110101) + dyears(1)
## [1] "2012-01-01 06:00:00 UTC"
ymd(20110101) + years(1)
## [1] "2012-01-01"
leap_year(2012) ## leap year
## [1] TRUE
#> [1] TRUE

meeting
## [1] "2011-07-01 09:00:00 UTC"
(meetings <- meeting + weeks(0:5))
## [1] "2011-07-01 09:00:00 UTC" "2011-07-08 09:00:00 UTC"
## [3] "2011-07-15 09:00:00 UTC" "2011-07-22 09:00:00 UTC"
## [5] "2011-07-29 09:00:00 UTC" "2011-08-05 09:00:00 UTC"
meetings %within% interval2
## [1] FALSE FALSE FALSE  TRUE  TRUE  TRUE
#[1] FALSE FALSE FALSE  TRUE  TRUE  TRUE

interval1 / ddays(1)
## [1] 67.08333
#> [1] 67.08333
interval1 / ddays(2)
## [1] 33.54167
#> [1] 33.54167
interval1 / dminutes(1)
## [1] 96600
#> [1] 96600

interval1
## [1] 2011-06-04 12:00:00 CST--2011-08-10 14:00:00 CST
interval1 %/% months(1)
## [1] 2
#> [1] 2

interval1 %% months(1)
## [1] 2011-08-04 12:00:00 CST--2011-08-10 14:00:00 CST
as.period(interval1 %% months(1))
## [1] "6d 2H 0M 0S"
#> [1] "6d 2H 0M 0S"

as.period(interval1)
## [1] "2m 6d 2H 0M 0S"
#> [1] "2m 6d 2H 0M 0S"

jan31 <- ymd("2013-01-31")
jan31 + months(0:11)
##  [1] "2013-01-31" NA           "2013-03-31" NA           "2013-05-31"
##  [6] NA           "2013-07-31" "2013-08-31" NA           "2013-10-31"
## [11] NA           "2013-12-31"
#>  [1] "2013-01-31" NA           "2013-03-31" NA           "2013-05-31"
#>  [6] NA           "2013-07-31" "2013-08-31" NA           "2013-10-31"
#> [11] NA           "2013-12-31"

floor_date(jan31, "month")
## [1] "2013-01-01"
floor_date(jan31, "month") + months(0:11) + days(31)
##  [1] "2013-02-01" "2013-03-04" "2013-04-01" "2013-05-02" "2013-06-01"
##  [6] "2013-07-02" "2013-08-01" "2013-09-01" "2013-10-02" "2013-11-01"
## [11] "2013-12-02" "2014-01-01"
#>  [1] "2013-02-01" "2013-03-04" "2013-04-01" "2013-05-02" "2013-06-01"
#>  [6] "2013-07-02" "2013-08-01" "2013-09-01" "2013-10-02" "2013-11-01"
#> [11] "2013-12-02" "2014-01-01"

jan31 %m+% months(0:11)
##  [1] "2013-01-31" "2013-02-28" "2013-03-31" "2013-04-30" "2013-05-31"
##  [6] "2013-06-30" "2013-07-31" "2013-08-31" "2013-09-30" "2013-10-31"
## [11] "2013-11-30" "2013-12-31"
#>  [1] "2013-01-31" "2013-02-28" "2013-03-31" "2013-04-30" "2013-05-31"
#>  [6] "2013-06-30" "2013-07-31" "2013-08-31" "2013-09-30" "2013-10-31"
#> [11] "2013-11-30" "2013-12-31"


(jan02 <- ymd("2013-01-02"))
## [1] "2013-01-02"
ceiling_date(jan02, "month")
## [1] "2013-02-01"
ceiling_date(jan02, "year")
## [1] "2014-01-01"
ceiling_date(jan02, "day")
## [1] "2013-01-03"
last_day <- function(date) {
  ceiling_date(date, "month") - days(1)
}

(jan02 <- ymd("2013-01-02"))
## [1] "2013-01-02"
last_day(jan02)
## [1] "2013-01-31"

1.10 数据排序

sorting data: order( )

leadership
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 1         1 2008-10-24      US      M  32     5     4     5     5     5    9
## 2         2 2008-10-28      US      F  45     3     5     2     5     5    8
## 3         3 2008-10-01      UK      F  25     3     5     5     5     2    8
## 4         4 2008-10-12      UK      M  39     3     3     4    NA    NA    6
## 5         5 2009-05-01      UK      F  NA     2     2     1     2     1    4
##   meanx agecat
## 1   4.5  Young
## 2   4.0  Young
## 3   4.0  Young
## 4   3.0  Young
## 5   2.0   <NA>
order(leadership$age)  #返回的是排序序号
## [1] 3 1 4 2 5
newdata <- leadership[ order(leadership$age),  ]
newdata
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 3         3 2008-10-01      UK      F  25     3     5     5     5     2    8
## 1         1 2008-10-24      US      M  32     5     4     5     5     5    9
## 4         4 2008-10-12      UK      M  39     3     3     4    NA    NA    6
## 2         2 2008-10-28      US      F  45     3     5     2     5     5    8
## 5         5 2009-05-01      UK      F  NA     2     2     1     2     1    4
##   meanx agecat
## 3   4.0  Young
## 1   4.5  Young
## 4   3.0  Young
## 2   4.0  Young
## 5   2.0   <NA>
newdata[2,]
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 1         1 2008-10-24      US      M  32     5     4     5     5     5    9
##   meanx agecat
## 1   4.5  Young
#先按照性别,再按照年龄排队:
attach(leadership)
## The following objects are masked _by_ .GlobalEnv:
## 
##     age, country, gender
  newdata <- leadership[order(gender, age),]
detach(leadership)
newdata
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 3         3 2008-10-01      UK      F  25     3     5     5     5     2    8
## 2         2 2008-10-28      US      F  45     3     5     2     5     5    8
## 5         5 2009-05-01      UK      F  NA     2     2     1     2     1    4
## 1         1 2008-10-24      US      M  32     5     4     5     5     5    9
## 4         4 2008-10-12      UK      M  39     3     3     4    NA    NA    6
##   meanx agecat
## 3   4.0  Young
## 2   4.0  Young
## 5   2.0   <NA>
## 1   4.5  Young
## 4   3.0  Young
#倒序:
attach(leadership)
## The following objects are masked _by_ .GlobalEnv:
## 
##     age, country, gender
  newdata <-leadership[order(gender, -age),]
detach(leadership)
newdata
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 5         5 2009-05-01      UK      F  NA     2     2     1     2     1    4
## 2         2 2008-10-28      US      F  45     3     5     2     5     5    8
## 3         3 2008-10-01      UK      F  25     3     5     5     5     2    8
## 4         4 2008-10-12      UK      M  39     3     3     4    NA    NA    6
## 1         1 2008-10-24      US      M  32     5     4     5     5     5    9
##   meanx agecat
## 5   2.0   <NA>
## 2   4.0  Young
## 3   4.0  Young
## 4   3.0  Young
## 1   4.5  Young

1.11 数据集的合并 Merging datasets

添加列—横向合并两个数据框(数据集): total <- merge(dataframeA, dataframeB, by=“ID”) total <- merge(dataframeA, dataframeB, by=c(“ID”,“Country”))

如果要直接横向合并两个矩阵或数据框,并且不需要指定一个公共索引,那么可以直接使用cbind()函数: total <- cbind(A, B)

添加行—纵向合并两个数据框(数据集), 使用rbind()函数: total <- rbind(dataframeA, dataframeB)

leadership
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 1         1 2008-10-24      US      M  32     5     4     5     5     5    9
## 2         2 2008-10-28      US      F  45     3     5     2     5     5    8
## 3         3 2008-10-01      UK      F  25     3     5     5     5     2    8
## 4         4 2008-10-12      UK      M  39     3     3     4    NA    NA    6
## 5         5 2009-05-01      UK      F  NA     2     2     1     2     1    4
##   meanx agecat
## 1   4.5  Young
## 2   4.0  Young
## 3   4.0  Young
## 4   3.0  Young
## 5   2.0   <NA>
newdata
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 5         5 2009-05-01      UK      F  NA     2     2     1     2     1    4
## 2         2 2008-10-28      US      F  45     3     5     2     5     5    8
## 3         3 2008-10-01      UK      F  25     3     5     5     5     2    8
## 4         4 2008-10-12      UK      M  39     3     3     4    NA    NA    6
## 1         1 2008-10-24      US      M  32     5     4     5     5     5    9
##   meanx agecat
## 5   2.0   <NA>
## 2   4.0  Young
## 3   4.0  Young
## 4   3.0  Young
## 1   4.5  Young
total <- merge(leadership, newdata, by="managerID")
total
##   managerID     Date.x country.x gender.x age.x item1.x item2.x item3.x item4.x
## 1         1 2008-10-24        US        M    32       5       4       5       5
## 2         2 2008-10-28        US        F    45       3       5       2       5
## 3         3 2008-10-01        UK        F    25       3       5       5       5
## 4         4 2008-10-12        UK        M    39       3       3       4      NA
## 5         5 2009-05-01        UK        F    NA       2       2       1       2
##   item5.x sumx.x meanx.x agecat.x     Date.y country.y gender.y age.y item1.y
## 1       5      9     4.5    Young 2008-10-24        US        M    32       5
## 2       5      8     4.0    Young 2008-10-28        US        F    45       3
## 3       2      8     4.0    Young 2008-10-01        UK        F    25       3
## 4      NA      6     3.0    Young 2008-10-12        UK        M    39       3
## 5       1      4     2.0     <NA> 2009-05-01        UK        F    NA       2
##   item2.y item3.y item4.y item5.y sumx.y meanx.y agecat.y
## 1       4       5       5       5      9     4.5    Young
## 2       5       2       5       5      8     4.0    Young
## 3       5       5       5       2      8     4.0    Young
## 4       3       4      NA      NA      6     3.0    Young
## 5       2       1       2       1      4     2.0     <NA>
total <- cbind(leadership, newdata)
total   #简单靠一起, 名字发生重复
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 1         1 2008-10-24      US      M  32     5     4     5     5     5    9
## 2         2 2008-10-28      US      F  45     3     5     2     5     5    8
## 3         3 2008-10-01      UK      F  25     3     5     5     5     2    8
## 4         4 2008-10-12      UK      M  39     3     3     4    NA    NA    6
## 5         5 2009-05-01      UK      F  NA     2     2     1     2     1    4
##   meanx agecat managerID       Date country gender age item1 item2 item3 item4
## 1   4.5  Young         5 2009-05-01      UK      F  NA     2     2     1     2
## 2   4.0  Young         2 2008-10-28      US      F  45     3     5     2     5
## 3   4.0  Young         3 2008-10-01      UK      F  25     3     5     5     5
## 4   3.0  Young         4 2008-10-12      UK      M  39     3     3     4    NA
## 5   2.0   <NA>         1 2008-10-24      US      M  32     5     4     5     5
##   item5 sumx meanx agecat
## 1     1    4   2.0   <NA>
## 2     5    8   4.0  Young
## 3     2    8   4.0  Young
## 4    NA    6   3.0  Young
## 5     5    9   4.5  Young
names(total)[11:15] <- c("manager1", "Date1", "Country1", "gender1", "age1")
total
##   managerID       Date country gender age item1 item2 item3 item4 item5
## 1         1 2008-10-24      US      M  32     5     4     5     5     5
## 2         2 2008-10-28      US      F  45     3     5     2     5     5
## 3         3 2008-10-01      UK      F  25     3     5     5     5     2
## 4         4 2008-10-12      UK      M  39     3     3     4    NA    NA
## 5         5 2009-05-01      UK      F  NA     2     2     1     2     1
##   manager1 Date1 Country1 gender1       age1 country gender age item1 item2
## 1        9   4.5    Young       5 2009-05-01      UK      F  NA     2     2
## 2        8   4.0    Young       2 2008-10-28      US      F  45     3     5
## 3        8   4.0    Young       3 2008-10-01      UK      F  25     3     5
## 4        6   3.0    Young       4 2008-10-12      UK      M  39     3     3
## 5        4   2.0     <NA>       1 2008-10-24      US      M  32     5     4
##   item3 item4 item5 sumx meanx agecat
## 1     1     2     1    4   2.0   <NA>
## 2     2     5     5    8   4.0  Young
## 3     5     5     2    8   4.0  Young
## 4     4    NA    NA    6   3.0  Young
## 5     5     5     5    9   4.5  Young
total <- rbind(leadership, newdata)

total
##    managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 1          1 2008-10-24      US      M  32     5     4     5     5     5    9
## 2          2 2008-10-28      US      F  45     3     5     2     5     5    8
## 3          3 2008-10-01      UK      F  25     3     5     5     5     2    8
## 4          4 2008-10-12      UK      M  39     3     3     4    NA    NA    6
## 5          5 2009-05-01      UK      F  NA     2     2     1     2     1    4
## 51         5 2009-05-01      UK      F  NA     2     2     1     2     1    4
## 21         2 2008-10-28      US      F  45     3     5     2     5     5    8
## 31         3 2008-10-01      UK      F  25     3     5     5     5     2    8
## 41         4 2008-10-12      UK      M  39     3     3     4    NA    NA    6
## 11         1 2008-10-24      US      M  32     5     4     5     5     5    9
##    meanx agecat
## 1    4.5  Young
## 2    4.0  Young
## 3    4.0  Young
## 4    3.0  Young
## 5    2.0   <NA>
## 51   2.0   <NA>
## 21   4.0  Young
## 31   4.0  Young
## 41   3.0  Young
## 11   4.5  Young

1.12 数据集取子集

从leadership数据框中选择了变量q1、q2、q3、q4和q5,并将它们保存到了数据框newdata中: 一下三种方式功能一样的:

leadership
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 1         1 2008-10-24      US      M  32     5     4     5     5     5    9
## 2         2 2008-10-28      US      F  45     3     5     2     5     5    8
## 3         3 2008-10-01      UK      F  25     3     5     5     5     2    8
## 4         4 2008-10-12      UK      M  39     3     3     4    NA    NA    6
## 5         5 2009-05-01      UK      F  NA     2     2     1     2     1    4
##   meanx agecat
## 1   4.5  Young
## 2   4.0  Young
## 3   4.0  Young
## 4   3.0  Young
## 5   2.0   <NA>
newdata <- leadership[, c(6:10)]
newdata
##   item1 item2 item3 item4 item5
## 1     5     4     5     5     5
## 2     3     5     2     5     5
## 3     3     5     5     5     2
## 4     3     3     4    NA    NA
## 5     2     2     1     2     1
newdata <- leadership[c(6:10)]
newdata
##   item1 item2 item3 item4 item5
## 1     5     4     5     5     5
## 2     3     5     2     5     5
## 3     3     5     5     5     2
## 4     3     3     4    NA    NA
## 5     2     2     1     2     1
myvars <- c("item1", "item2", "item3", "item4", "item5") 

newdata <-leadership[myvars]
newdata
##   item1 item2 item3 item4 item5
## 1     5     4     5     5     5
## 2     3     5     2     5     5
## 3     3     5     5     5     2
## 4     3     3     4    NA    NA
## 5     2     2     1     2     1
myvars <- paste("item", 1:5, sep="") 
newdata <- leadership[myvars]
newdata
##   item1 item2 item3 item4 item5
## 1     5     4     5     5     5
## 2     3     5     2     5     5
## 3     3     5     5     5     2
## 4     3     3     4    NA    NA
## 5     2     2     1     2     1

1.13 剔除(丢弃)变量

myvars <- names(leadership) %in% c("item3", "item4") 
myvars
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE
## [13] FALSE
newdata <- leadership[ !myvars ]
newdata
##   managerID       Date country gender age item1 item2 item5 sumx meanx agecat
## 1         1 2008-10-24      US      M  32     5     4     5    9   4.5  Young
## 2         2 2008-10-28      US      F  45     3     5     5    8   4.0  Young
## 3         3 2008-10-01      UK      F  25     3     5     2    8   4.0  Young
## 4         4 2008-10-12      UK      M  39     3     3    NA    6   3.0  Young
## 5         5 2009-05-01      UK      F  NA     2     2     1    4   2.0   <NA>
#在知道q3和q4是第8个和第9个变量的情况下:
newdata <- leadership[ c(-8, -9) ]
newdata
##   managerID       Date country gender age item1 item2 item5 sumx meanx agecat
## 1         1 2008-10-24      US      M  32     5     4     5    9   4.5  Young
## 2         2 2008-10-28      US      F  45     3     5     5    8   4.0  Young
## 3         3 2008-10-01      UK      F  25     3     5     2    8   4.0  Young
## 4         4 2008-10-12      UK      M  39     3     3    NA    6   3.0  Young
## 5         5 2009-05-01      UK      F  NA     2     2     1    4   2.0   <NA>
#或者直接赋值NULL :
leadership$q3 <- leadership$q4 <- NULL
leadership
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 1         1 2008-10-24      US      M  32     5     4     5     5     5    9
## 2         2 2008-10-28      US      F  45     3     5     2     5     5    8
## 3         3 2008-10-01      UK      F  25     3     5     5     5     2    8
## 4         4 2008-10-12      UK      M  39     3     3     4    NA    NA    6
## 5         5 2009-05-01      UK      F  NA     2     2     1     2     1    4
##   meanx agecat
## 1   4.5  Young
## 2   4.0  Young
## 3   4.0  Young
## 4   3.0  Young
## 5   2.0   <NA>

1.14 选入观测(行)

  leadership
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 1         1 2008-10-24      US      M  32     5     4     5     5     5    9
## 2         2 2008-10-28      US      F  45     3     5     2     5     5    8
## 3         3 2008-10-01      UK      F  25     3     5     5     5     2    8
## 4         4 2008-10-12      UK      M  39     3     3     4    NA    NA    6
## 5         5 2009-05-01      UK      F  NA     2     2     1     2     1    4
##   meanx agecat
## 1   4.5  Young
## 2   4.0  Young
## 3   4.0  Young
## 4   3.0  Young
## 5   2.0   <NA>
  newdata <- leadership[1:3,]
  newdata
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 1         1 2008-10-24      US      M  32     5     4     5     5     5    9
## 2         2 2008-10-28      US      F  45     3     5     2     5     5    8
## 3         3 2008-10-01      UK      F  25     3     5     5     5     2    8
##   meanx agecat
## 1   4.5  Young
## 2   4.0  Young
## 3   4.0  Young
  newdata <- leadership[which(leadership$gender=="M" &
                                leadership$age > 30),  ]
  newdata
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 1         1 2008-10-24      US      M  32     5     4     5     5     5    9
## 4         4 2008-10-12      UK      M  39     3     3     4    NA    NA    6
##   meanx agecat
## 1   4.5  Young
## 4   3.0  Young
  attach(leadership)
## The following objects are masked _by_ .GlobalEnv:
## 
##     age, country, gender
   newdata <- leadership[which(gender=='M' & age > 30),  ]
  detach(leadership)
  
  leadership$date <- as.Date(leadership$Date, "%m/%d/%y")
  startdate <- as.Date("2009-01-01")
  enddate <- as.Date("2009-10-31")
  newdata <- leadership[which(leadership$date >= startdate &
                                leadership$date <= enddate),  ]
  newdata
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 5         5 2009-05-01      UK      F  NA     2     2     1     2     1    4
##   meanx agecat       date
## 5     2   <NA> 2009-05-01
x <- c(1,2,3,4,5,6,7,8,9,10)
x[c(TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE)]
## [1]  1 10
#1 10

leadership
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 1         1 2008-10-24      US      M  32     5     4     5     5     5    9
## 2         2 2008-10-28      US      F  45     3     5     2     5     5    8
## 3         3 2008-10-01      UK      F  25     3     5     5     5     2    8
## 4         4 2008-10-12      UK      M  39     3     3     4    NA    NA    6
## 5         5 2009-05-01      UK      F  NA     2     2     1     2     1    4
##   meanx agecat       date
## 1   4.5  Young 2008-10-24
## 2   4.0  Young 2008-10-28
## 3   4.0  Young 2008-10-01
## 4   3.0  Young 2008-10-12
## 5   2.0   <NA> 2009-05-01
newdata <- leadership[ leadership$date >= startdate & leadership$date <= enddate,  ]
newdata
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 5         5 2009-05-01      UK      F  NA     2     2     1     2     1    4
##   meanx agecat       date
## 5     2   <NA> 2009-05-01

subset

#在下面第一个示例中,选择了所有age值大于等于35或age值小于24的行,保留了变量q1、q2:
  leadership  
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 1         1 2008-10-24      US      M  32     5     4     5     5     5    9
## 2         2 2008-10-28      US      F  45     3     5     2     5     5    8
## 3         3 2008-10-01      UK      F  25     3     5     5     5     2    8
## 4         4 2008-10-12      UK      M  39     3     3     4    NA    NA    6
## 5         5 2009-05-01      UK      F  NA     2     2     1     2     1    4
##   meanx agecat       date
## 1   4.5  Young 2008-10-24
## 2   4.0  Young 2008-10-28
## 3   4.0  Young 2008-10-01
## 4   3.0  Young 2008-10-12
## 5   2.0   <NA> 2009-05-01
  newdata <- subset(leadership, age >= 35 | age < 24, select=c(item1, item2))
  newdata
##   item1 item2
## 2     3     5
## 4     3     3
#在下面第二个示例中,选择了所有25岁以上的男性,并保留了变量gender到q2(gender、q4和其间所有列):
 leadership
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 1         1 2008-10-24      US      M  32     5     4     5     5     5    9
## 2         2 2008-10-28      US      F  45     3     5     2     5     5    8
## 3         3 2008-10-01      UK      F  25     3     5     5     5     2    8
## 4         4 2008-10-12      UK      M  39     3     3     4    NA    NA    6
## 5         5 2009-05-01      UK      F  NA     2     2     1     2     1    4
##   meanx agecat       date
## 1   4.5  Young 2008-10-24
## 2   4.0  Young 2008-10-28
## 3   4.0  Young 2008-10-01
## 4   3.0  Young 2008-10-12
## 5   2.0   <NA> 2009-05-01
 newdata <- subset(leadership, gender=="M" & age > 25, select=gender:item2)
 newdata  
##   gender age item1 item2
## 1      M  32     5     4
## 4      M  39     3     3

1.15 数据抽取sample

#从leadership数据集中随机抽取一个大小为3的样本:
 mysample <- leadership[sample(1:nrow(leadership), 3, replace=FALSE),]
 
#sample 函数,  replace=FALSE 表示不放回
 mysample
##   managerID       Date country gender age item1 item2 item3 item4 item5 sumx
## 1         1 2008-10-24      US      M  32     5     4     5     5     5    9
## 4         4 2008-10-12      UK      M  39     3     3     4    NA    NA    6
## 3         3 2008-10-01      UK      F  25     3     5     5     5     2    8
##   meanx agecat       date
## 1   4.5  Young 2008-10-24
## 4   3.0  Young 2008-10-12
## 3   4.0  Young 2008-10-01

1.16 使用SQL 语句操作数据框 (可选内容)

# install.packages("sqldf")
 library(sqldf)
## 载入需要的程辑包:gsubfn
## 载入需要的程辑包:proto
## 载入需要的程辑包:RSQLite
 mtcars
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
 newdf <- sqldf("select * from mtcars where carb=1 order by mpg", row.names=TRUE)
 newdf
##                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Valiant        18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Toyota Corona  21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Fiat X1-9      27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
 sqldf("select avg(mpg) as avg_mpg, avg(disp) as avg_disp, gear
 from mtcars where cyl in (4, 6) group by gear")
##    avg_mpg avg_disp gear
## 1 20.33333 201.0333    3
## 2 24.53333 123.0167    4
## 3 25.36667 120.1333    5

1.17 初级数据管理—Summary

基础知识: R存储缺失值和日期值的方式,处理方法; 如何确定一个对象的数据类型,如何将它转换为其他类型; 使用简单的公式创建了新变量并重编码了现有变量; 展示了如何对数据进行排序和对变量进行重命名; 如何对数据集进行横向合并(添加变量)和纵向合并(添加观测); 讨论了如何保留或丢弃变量,以及如何基于一系列的准则选取观测。

1.17.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.1

1.17.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.2

2 高级数据管理

2.1 数学和统计函数

 x = c( 0 : 360 )
 y = sin( x/180 * pi )
 plot(x, y, type="p", cex = 0.5,  las=1, col="blue", 
      cex.main=2.0, cex.axis = 1.0, cex.lab = 2.0, 
      main="Sine Function",  xlab="Degree", ylab="y = sin(x)" )
 abline ( h = 0.0,  lty = 2 )

# 统计函数

 xm <- mean(x)
 ym <- mean(y, trim=0.05, 
            na.rm = TRUE)
# 截尾平均数,即丢弃了最大5%和最小5%的数据和所有缺失值后的算术平均数

# 复杂语句:
 x <- c(1:8)
 n <- length(x)
 meanx <- sum(x)/n
 css <- sum(( x - meanx)^2)
 sdx <- sqrt(css/(n-1))
 
# 简洁语句:
 mean(x)
## [1] 4.5
 sd(x)
## [1] 2.44949
# 数据的标准化
# 在面试录取研究生的时候,有20位老师给100位学生打分,为了加快面试速度,可能又随机分了组,比如分两组,各10位老师给50位学生打分;
# 这里边如果直接平均老师的分数,存在一些非常不合理的问题,每个老师打分的最小、最大、标准差、均值都不一样,所以要对数据作标准化(Standardizing data):
 
score1 <- c(92, 95, 95, 89, 79, 60, 78, 50, 30, 80, 76, 87)
score2 <- c(95, 95, 89, 79, 60, 78, 50, 30, 80, 76, 87, 67)
score3 <- c(95, 89, 79, 60, 78, 50, 30, 80, 76, 87, 57, 48)
score4 <- c(89, 79, 60, 78, 50, 30, 80, 76, 87, 49, 87, 67)
score5 <- c(99, 98, 96, 99, 89, 90, 88, 90, 80, 90, 96, 97)
mydata <- data.frame(score1, score2, score3, score4, score5)
mydata
##    score1 score2 score3 score4 score5
## 1      92     95     95     89     99
## 2      95     95     89     79     98
## 3      95     89     79     60     96
## 4      89     79     60     78     99
## 5      79     60     78     50     89
## 6      60     78     50     30     90
## 7      78     50     30     80     88
## 8      50     30     80     76     90
## 9      30     80     76     87     80
## 10     80     76     87     49     90
## 11     76     87     57     87     96
## 12     87     67     48     67     97
newdata <- scale(mydata)           #均值为0, 标准差为1
newdata
##             score1     score2     score3     score4     score5
##  [1,]  0.809248084  1.0945466  1.3109797  1.0630748  1.0969655
##  [2,]  0.960195913  1.0945466  1.0074732  0.5225283  0.9237604
##  [3,]  0.960195913  0.7842814  0.5016289 -0.5045101  0.5773503
##  [4,]  0.658300255  0.2671728 -0.4594752  0.4684737  1.0969655
##  [5,]  0.155140824 -0.7153336  0.4510445 -1.0450566 -0.6350853
##  [6,] -0.800862094  0.2154619 -0.9653195 -2.1261497 -0.4618802
##  [7,]  0.104824881 -1.2324423 -1.9770080  0.5765830 -0.8082904
##  [8,] -1.304021524 -2.2666595  0.5522133  0.3603643 -0.4618802
##  [9,] -2.310340385  0.3188837  0.3498756  0.9549655 -2.1939310
## [10,]  0.205456767  0.1120402  0.9063043 -1.0991113 -0.4618802
## [11,]  0.004192995  0.6808597 -0.6112285  0.9549655  0.5773503
## [12,]  0.557668369 -0.3533576 -1.0664883 -0.1261275  0.7505553
## attr(,"scaled:center")
##   score1   score2   score3   score4   score5 
## 75.91667 73.83333 69.08333 69.33333 92.66667 
## attr(,"scaled:scale")
##    score1    score2    score3    score4    score5 
## 19.874416 19.338296 19.768930 18.499795  5.773503
attr(newdata,"scaled:scale")
##    score1    score2    score3    score4    score5 
## 19.874416 19.338296 19.768930 18.499795  5.773503
attr(newdata,"scaled:center")
##   score1   score2   score3   score4   score5 
## 75.91667 73.83333 69.08333 69.33333 92.66667
newdata <- scale(mydata)*10 + 50   #标准差10,均值50

newdata
##         score1   score2   score3   score4   score5
##  [1,] 58.09248 60.94547 63.10980 60.63075 60.96966
##  [2,] 59.60196 60.94547 60.07473 55.22528 59.23760
##  [3,] 59.60196 57.84281 55.01629 44.95490 55.77350
##  [4,] 56.58300 52.67173 45.40525 54.68474 60.96966
##  [5,] 51.55141 42.84666 54.51044 39.54943 43.64915
##  [6,] 41.99138 52.15462 40.34681 28.73850 45.38120
##  [7,] 51.04825 37.67558 30.22992 55.76583 41.91710
##  [8,] 36.95978 27.33340 55.52213 53.60364 45.38120
##  [9,] 26.89660 53.18884 53.49876 59.54966 28.06069
## [10,] 52.05457 51.12040 59.06304 39.00889 45.38120
## [11,] 50.04193 56.80860 43.88772 59.54966 55.77350
## [12,] 55.57668 46.46642 39.33512 48.73872 57.50555
## attr(,"scaled:center")
##   score1   score2   score3   score4   score5 
## 75.91667 73.83333 69.08333 69.33333 92.66667 
## attr(,"scaled:scale")
##    score1    score2    score3    score4    score5 
## 19.874416 19.338296 19.768930 18.499795  5.773503
res1 <- rowMeans(mydata)
res1
##  [1] 94.0 91.2 83.8 81.0 71.2 61.6 65.2 65.2 70.6 76.4 80.6 73.2
myres1 <- data.frame(mydata, res1)
myres1[ order(myres1$res1),  ]
##    score1 score2 score3 score4 score5 res1
## 6      60     78     50     30     90 61.6
## 7      78     50     30     80     88 65.2
## 8      50     30     80     76     90 65.2
## 9      30     80     76     87     80 70.6
## 5      79     60     78     50     89 71.2
## 12     87     67     48     67     97 73.2
## 10     80     76     87     49     90 76.4
## 11     76     87     57     87     96 80.6
## 4      89     79     60     78     99 81.0
## 3      95     89     79     60     96 83.8
## 2      95     95     89     79     98 91.2
## 1      92     95     95     89     99 94.0
myres1
##    score1 score2 score3 score4 score5 res1
## 1      92     95     95     89     99 94.0
## 2      95     95     89     79     98 91.2
## 3      95     89     79     60     96 83.8
## 4      89     79     60     78     99 81.0
## 5      79     60     78     50     89 71.2
## 6      60     78     50     30     90 61.6
## 7      78     50     30     80     88 65.2
## 8      50     30     80     76     90 65.2
## 9      30     80     76     87     80 70.6
## 10     80     76     87     49     90 76.4
## 11     76     87     57     87     96 80.6
## 12     87     67     48     67     97 73.2
res2 <- rowMeans(newdata) #行平均
myres2 <- data.frame(mydata, res2)
myres2[ order(myres2$res2),  ]
##    score1 score2 score3 score4 score5     res2
## 6      60     78     50     30     90 41.72250
## 7      78     50     30     80     88 43.32733
## 8      50     30     80     76     90 43.76003
## 9      30     80     76     87     80 44.23891
## 5      79     60     78     50     89 46.42142
## 10     80     76     87     49     90 49.32562
## 12     87     67     48     67     97 49.52450
## 11     76     87     57     87     96 53.21228
## 4      89     79     60     78     99 54.06287
## 3      95     89     79     60     96 54.63789
## 2      95     95     89     79     98 59.01701
## 1      92     95     95     89     99 60.74963
#对数据中某一变量标准化:
newdata <- transform(mydata, score5 = scale(score5)*10+50)
newdata
##    score1 score2 score3 score4   score5
## 1      92     95     95     89 60.96966
## 2      95     95     89     79 59.23760
## 3      95     89     79     60 55.77350
## 4      89     79     60     78 60.96966
## 5      79     60     78     50 43.64915
## 6      60     78     50     30 45.38120
## 7      78     50     30     80 41.91710
## 8      50     30     80     76 45.38120
## 9      30     80     76     87 28.06069
## 10     80     76     87     49 45.38120
## 11     76     87     57     87 55.77350
## 12     87     67     48     67 57.50555

2.2 概率密度函数(简称密度函数)

描述随机变量在某个确定的取值点附近的可能性的函数。

x <- pretty(33.5:39.5, 600)   #创建美观的分割点,将连续变量x分割为n个区间

ynorm <- dnorm(x, mean=36.5, sd = 0.8) 

y1.5     <- dnorm(x, mean=36.5, sd = 1.5) 

plot(ynorm ~ x, pch=1, col="DarkTurquoise", ylim=c(0,0.6), xlab="Temperature of the body", ylab="Density",main="Density Functions", xaxs="i", yaxs="i")
lines(x, y1.5, col="blue", lty=1, lwd=3)

x <- pretty(-3:3, 30)
ynorm <- dnorm(x) 
yt <- dt(x, df=1) 

plot(ynorm ~ x, pch=15, col="DarkTurquoise", ylim=c(0,0.4), xlab="Vector of Quantiles", ylab="Density",main="Density Distribution Functions")

plot(ynorm ~ x, pch=15, col="DarkTurquoise", ylim=c(0,0.4), xlab="Vector of Quantiles", ylab="Density",main="Density Distribution Functions", xaxs="i", yaxs="i")
lines(x, ynorm, col="DarkTurquoise", lty=1)
points(x, yt, pch=16,col="DeepPink",cex=1)
lines(x, yt, col="DeepPink", lty=2, lwd=2)
legend(1.8,0.39,c("Normal","Student T"), col=c("DarkTurquoise","DeepPink"), text.col=c("DarkTurquoise","DeepPink"), pch=c(15,16), lty=c(1,2))

pnorm(1.96)  #0.975    位于x=1.96左侧曲线下方的面积
## [1] 0.9750021
pt(1.96, 1)      #0.850
## [1] 0.8498286
x <- pretty(33.5:39.5, 600)
ynorm <- dnorm(x, mean=36.5, sd = 0.8) 
plot(ynorm~x, pch=1, col="DarkTurquoise", ylim=c(0,1.0), 
     xlab="Temperature of the body", ylab="Density",main="probability Density Functions (pdf) & Cumulative Distribution Function (CDF)", 
     xaxs="i", yaxs="i")
yp <- pnorm(x, mean=36.5, sd = 0.8)
lines(x, yp, col="blue", lwd=3)

qnorm(0.95, mean=36.5, sd=0.8)
## [1] 37.81588
qnorm(0.75, mean=36.5, sd=0.8)
## [1] 37.03959
qnorm(0.50, mean=36.5, sd=0.8)
## [1] 36.5
qnorm(0.25, mean=36.5, sd=0.8)
## [1] 35.96041

2.3 随机数

#生成50个均值为50,标准差为10的正态随机数
rnorm(100, mean=50, sd=10)
##   [1] 53.00942 38.16144 46.90055 28.15861 66.41598 48.23867 60.11290 43.71823
##   [9] 72.75052 63.67231 70.69437 57.63203 51.10008 64.82505 39.86845 55.29645
##  [17] 38.80570 46.36775 49.19441 38.64582 50.87240 51.36359 54.20151 60.72772
##  [25] 80.86780 56.62534 41.91201 52.79017 46.13687 49.17425 49.58915 60.63855
##  [33] 38.67791 47.77770 37.45722 38.53883 51.44932 47.19463 64.08087 45.62670
##  [41] 66.21964 52.01476 41.36608 38.85086 39.92983 46.09607 46.92721 52.76771
##  [49] 38.43757 46.49249 48.02060 58.02652 42.27039 35.49569 55.12285 58.04769
##  [57] 39.94541 61.87370 56.31730 56.29335 38.58363 58.14960 58.38641 59.75349
##  [65] 38.10810 33.42889 31.94336 50.76991 53.90309 47.86091 54.64006 61.66393
##  [73] 61.98280 63.75532 68.94953 42.36623 39.37098 34.09456 50.78561 52.80675
##  [81] 38.73806 57.76137 54.21911 53.25080 66.69240 60.65281 64.22991 46.10097
##  [89] 51.94213 54.19566 61.55201 57.36599 65.12016 54.86770 43.53554 40.89651
##  [97] 41.75186 53.46316 24.51178 71.76688
x <- seq(1:100)
y <- rnorm(100, mean=50, sd=10) 
plot(x, y, pch=17, col="blue", cex=1, 
     cex.axis=1.5, xlim=c(0, 100), ylim=c(20,80), main="rnorm(100, mean=50, sd=10)", xaxs="i", yaxs="i")
lines(x, y, lty=2, col="red", lwd=2)
library(Hmisc)
## 载入需要的程辑包:lattice
## 载入需要的程辑包:survival
## 载入需要的程辑包:Formula
## 载入需要的程辑包:ggplot2
## 
## 载入程辑包:'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:plyr':
## 
##     is.discrete, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
minor.tick(nx=2, ny=2, tick.ratio = 0.5)

2.4 设定随机数种子

在每次生成伪随机数的时候,函数都会使用一个不同的种子,因此也会产生不同的结果。 如果需要结果重现(reproducible),可通过函数set.seed()显式指定这个种子。 函数runif()用来生成0到1区间上服从均匀分布的伪随机数。

runif(5)
## [1] 0.9482966 0.1848350 0.2432801 0.6826676 0.4987312
runif(5)
## [1] 0.27944116 0.03861113 0.74238413 0.53629883 0.87030823
set.seed(1234)
runif(5)
## [1] 0.1137034 0.6222994 0.6092747 0.6233794 0.8609154
set.seed(1234)
runif(5)
## [1] 0.1137034 0.6222994 0.6092747 0.6233794 0.8609154

2.5 将函数应用于矩阵和数据框

避免使用循环语句!

#R函数可以应用到一系列的数据对象上,包括标量、向量、矩阵、数组和数据框
a <- 5
sqrt(a)
## [1] 2.236068
b <- c(1.24, 5.65, 2.99)
sqrt(b)
## [1] 1.113553 2.376973 1.729162
round(b)
## [1] 1 6 3
c <- matrix(runif(12), nrow=3)
log(c)
##            [,1]       [,2]        [,3]       [,4]
## [1,] -0.4458019 -0.4063399 -0.60701566 -1.2299204
## [2,] -4.6569103 -0.6650435 -1.26325023 -0.1775781
## [3,] -1.4586478 -0.3658724 -0.07965651 -1.2509831
mean(c)
## [1] 0.493605
mydata <- matrix(rnorm(30), nrow=6)
mydata
##            [,1]        [,2]        [,3]         [,4]       [,5]
## [1,] -0.6224568 -1.68732684  0.37563561  0.011395161 0.54317290
## [2,] -0.7315360 -0.62743621  0.31026217  0.009859946 0.02142719
## [3,] -0.5166697  0.01831663  0.00500695  0.678271423 0.16256579
## [4,] -1.7507334  0.70524346 -0.03763026  1.029563029 1.24175425
## [5,]  0.8801042 -0.64701901  0.72397606 -1.729528504 0.78277748
## [6,]  1.3700104  0.86818087 -0.49673886 -2.204348095 0.04812071
#apply(x,  MARGIN, FUN, ...)
#x为数据对象,MARGIN是维度的下标,FUN是由你指定的函数,
#而...则包括了任何想传递给FUN的参数。MARGIN=1表示行,MARGIN=2表示列。
apply(mydata,  1,  mean)
## [1] -0.275915992 -0.203484580  0.069498214  0.237639408  0.002062036
## [6] -0.082955004
apply(mydata,  2,  mean)
## [1] -0.2285469 -0.2283402  0.1467519 -0.3674645  0.4666364
apply(mydata,  2,  mean,   trim=0.2)
## [1] -0.2476396 -0.1377238  0.1633186 -0.2575005  0.3841592
#(trim=0.2截去首尾各20%后平均)
#apply用于多维数组及自定义函数
myfun <- function(x){
  sum(x^2)
}
x<-array(1:24, c(2,3,4))
x
## , , 1
## 
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
## 
## , , 2
## 
##      [,1] [,2] [,3]
## [1,]    7    9   11
## [2,]    8   10   12
## 
## , , 3
## 
##      [,1] [,2] [,3]
## [1,]   13   15   17
## [2,]   14   16   18
## 
## , , 4
## 
##      [,1] [,2] [,3]
## [1,]   19   21   23
## [2,]   20   22   24
apply(x, 1, sum)
## [1] 144 156
apply(x, 2, sum)
## [1]  84 100 116
apply(x, 3, sum)
## [1]  21  57  93 129
apply(x, 1, myfun)
## [1] 2300 2600
#apply, tapply, lapply, sapply的区别
iris
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 1            5.1         3.5          1.4         0.2     setosa
## 2            4.9         3.0          1.4         0.2     setosa
## 3            4.7         3.2          1.3         0.2     setosa
## 4            4.6         3.1          1.5         0.2     setosa
## 5            5.0         3.6          1.4         0.2     setosa
## 6            5.4         3.9          1.7         0.4     setosa
## 7            4.6         3.4          1.4         0.3     setosa
## 8            5.0         3.4          1.5         0.2     setosa
## 9            4.4         2.9          1.4         0.2     setosa
## 10           4.9         3.1          1.5         0.1     setosa
## 11           5.4         3.7          1.5         0.2     setosa
## 12           4.8         3.4          1.6         0.2     setosa
## 13           4.8         3.0          1.4         0.1     setosa
## 14           4.3         3.0          1.1         0.1     setosa
## 15           5.8         4.0          1.2         0.2     setosa
## 16           5.7         4.4          1.5         0.4     setosa
## 17           5.4         3.9          1.3         0.4     setosa
## 18           5.1         3.5          1.4         0.3     setosa
## 19           5.7         3.8          1.7         0.3     setosa
## 20           5.1         3.8          1.5         0.3     setosa
## 21           5.4         3.4          1.7         0.2     setosa
## 22           5.1         3.7          1.5         0.4     setosa
## 23           4.6         3.6          1.0         0.2     setosa
## 24           5.1         3.3          1.7         0.5     setosa
## 25           4.8         3.4          1.9         0.2     setosa
## 26           5.0         3.0          1.6         0.2     setosa
## 27           5.0         3.4          1.6         0.4     setosa
## 28           5.2         3.5          1.5         0.2     setosa
## 29           5.2         3.4          1.4         0.2     setosa
## 30           4.7         3.2          1.6         0.2     setosa
## 31           4.8         3.1          1.6         0.2     setosa
## 32           5.4         3.4          1.5         0.4     setosa
## 33           5.2         4.1          1.5         0.1     setosa
## 34           5.5         4.2          1.4         0.2     setosa
## 35           4.9         3.1          1.5         0.2     setosa
## 36           5.0         3.2          1.2         0.2     setosa
## 37           5.5         3.5          1.3         0.2     setosa
## 38           4.9         3.6          1.4         0.1     setosa
## 39           4.4         3.0          1.3         0.2     setosa
## 40           5.1         3.4          1.5         0.2     setosa
## 41           5.0         3.5          1.3         0.3     setosa
## 42           4.5         2.3          1.3         0.3     setosa
## 43           4.4         3.2          1.3         0.2     setosa
## 44           5.0         3.5          1.6         0.6     setosa
## 45           5.1         3.8          1.9         0.4     setosa
## 46           4.8         3.0          1.4         0.3     setosa
## 47           5.1         3.8          1.6         0.2     setosa
## 48           4.6         3.2          1.4         0.2     setosa
## 49           5.3         3.7          1.5         0.2     setosa
## 50           5.0         3.3          1.4         0.2     setosa
## 51           7.0         3.2          4.7         1.4 versicolor
## 52           6.4         3.2          4.5         1.5 versicolor
## 53           6.9         3.1          4.9         1.5 versicolor
## 54           5.5         2.3          4.0         1.3 versicolor
## 55           6.5         2.8          4.6         1.5 versicolor
## 56           5.7         2.8          4.5         1.3 versicolor
## 57           6.3         3.3          4.7         1.6 versicolor
## 58           4.9         2.4          3.3         1.0 versicolor
## 59           6.6         2.9          4.6         1.3 versicolor
## 60           5.2         2.7          3.9         1.4 versicolor
## 61           5.0         2.0          3.5         1.0 versicolor
## 62           5.9         3.0          4.2         1.5 versicolor
## 63           6.0         2.2          4.0         1.0 versicolor
## 64           6.1         2.9          4.7         1.4 versicolor
## 65           5.6         2.9          3.6         1.3 versicolor
## 66           6.7         3.1          4.4         1.4 versicolor
## 67           5.6         3.0          4.5         1.5 versicolor
## 68           5.8         2.7          4.1         1.0 versicolor
## 69           6.2         2.2          4.5         1.5 versicolor
## 70           5.6         2.5          3.9         1.1 versicolor
## 71           5.9         3.2          4.8         1.8 versicolor
## 72           6.1         2.8          4.0         1.3 versicolor
## 73           6.3         2.5          4.9         1.5 versicolor
## 74           6.1         2.8          4.7         1.2 versicolor
## 75           6.4         2.9          4.3         1.3 versicolor
## 76           6.6         3.0          4.4         1.4 versicolor
## 77           6.8         2.8          4.8         1.4 versicolor
## 78           6.7         3.0          5.0         1.7 versicolor
## 79           6.0         2.9          4.5         1.5 versicolor
## 80           5.7         2.6          3.5         1.0 versicolor
## 81           5.5         2.4          3.8         1.1 versicolor
## 82           5.5         2.4          3.7         1.0 versicolor
## 83           5.8         2.7          3.9         1.2 versicolor
## 84           6.0         2.7          5.1         1.6 versicolor
## 85           5.4         3.0          4.5         1.5 versicolor
## 86           6.0         3.4          4.5         1.6 versicolor
## 87           6.7         3.1          4.7         1.5 versicolor
## 88           6.3         2.3          4.4         1.3 versicolor
## 89           5.6         3.0          4.1         1.3 versicolor
## 90           5.5         2.5          4.0         1.3 versicolor
## 91           5.5         2.6          4.4         1.2 versicolor
## 92           6.1         3.0          4.6         1.4 versicolor
## 93           5.8         2.6          4.0         1.2 versicolor
## 94           5.0         2.3          3.3         1.0 versicolor
## 95           5.6         2.7          4.2         1.3 versicolor
## 96           5.7         3.0          4.2         1.2 versicolor
## 97           5.7         2.9          4.2         1.3 versicolor
## 98           6.2         2.9          4.3         1.3 versicolor
## 99           5.1         2.5          3.0         1.1 versicolor
## 100          5.7         2.8          4.1         1.3 versicolor
## 101          6.3         3.3          6.0         2.5  virginica
## 102          5.8         2.7          5.1         1.9  virginica
## 103          7.1         3.0          5.9         2.1  virginica
## 104          6.3         2.9          5.6         1.8  virginica
## 105          6.5         3.0          5.8         2.2  virginica
## 106          7.6         3.0          6.6         2.1  virginica
## 107          4.9         2.5          4.5         1.7  virginica
## 108          7.3         2.9          6.3         1.8  virginica
## 109          6.7         2.5          5.8         1.8  virginica
## 110          7.2         3.6          6.1         2.5  virginica
## 111          6.5         3.2          5.1         2.0  virginica
## 112          6.4         2.7          5.3         1.9  virginica
## 113          6.8         3.0          5.5         2.1  virginica
## 114          5.7         2.5          5.0         2.0  virginica
## 115          5.8         2.8          5.1         2.4  virginica
## 116          6.4         3.2          5.3         2.3  virginica
## 117          6.5         3.0          5.5         1.8  virginica
## 118          7.7         3.8          6.7         2.2  virginica
## 119          7.7         2.6          6.9         2.3  virginica
## 120          6.0         2.2          5.0         1.5  virginica
## 121          6.9         3.2          5.7         2.3  virginica
## 122          5.6         2.8          4.9         2.0  virginica
## 123          7.7         2.8          6.7         2.0  virginica
## 124          6.3         2.7          4.9         1.8  virginica
## 125          6.7         3.3          5.7         2.1  virginica
## 126          7.2         3.2          6.0         1.8  virginica
## 127          6.2         2.8          4.8         1.8  virginica
## 128          6.1         3.0          4.9         1.8  virginica
## 129          6.4         2.8          5.6         2.1  virginica
## 130          7.2         3.0          5.8         1.6  virginica
## 131          7.4         2.8          6.1         1.9  virginica
## 132          7.9         3.8          6.4         2.0  virginica
## 133          6.4         2.8          5.6         2.2  virginica
## 134          6.3         2.8          5.1         1.5  virginica
## 135          6.1         2.6          5.6         1.4  virginica
## 136          7.7         3.0          6.1         2.3  virginica
## 137          6.3         3.4          5.6         2.4  virginica
## 138          6.4         3.1          5.5         1.8  virginica
## 139          6.0         3.0          4.8         1.8  virginica
## 140          6.9         3.1          5.4         2.1  virginica
## 141          6.7         3.1          5.6         2.4  virginica
## 142          6.9         3.1          5.1         2.3  virginica
## 143          5.8         2.7          5.1         1.9  virginica
## 144          6.8         3.2          5.9         2.3  virginica
## 145          6.7         3.3          5.7         2.5  virginica
## 146          6.7         3.0          5.2         2.3  virginica
## 147          6.3         2.5          5.0         1.9  virginica
## 148          6.5         3.0          5.2         2.0  virginica
## 149          6.2         3.4          5.4         2.3  virginica
## 150          5.9         3.0          5.1         1.8  virginica
a <- apply(iris[, 1:4],  2,  mean)   #前四列数据,按列,求均值
a
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##     5.843333     3.057333     3.758000     1.199333
#tapply函数:将数据按照条件分组,生成类似列联表形式的数据结果
#tapply(X, INDEX, FUN = NULL, ..., default = NA, simplify = TRUE) 
#X:数组、矩阵、数据框等分割型数据向量
#INDEX:一个或多个因子的列表,每个因子的长度都与x相同
#FUN: 自定义的调用函数
tapply(leadership$age, leadership$country, mean)   # 求在不同country的age的均值
##   UK   US 
##   NA 38.5
#求在不同country和gender交叉水平下的age的均值, 输出得到矩阵数据
tapply(leadership$age, list(leadership$country, leadership$gender), mean) 
##     F  M
## UK NA 39
## US 45 32
#lapply函数: 对列表、数据框数据集进行循环,输入为列表,返回值为列表lapply(X, FUN, ...) 
#X:列表、数据框
#FUN:自定义的调用函数
b <- list(x = 1:10, y = matrix(1:12, 3, 4))
b
## $x
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $y
##      [,1] [,2] [,3] [,4]
## [1,]    1    4    7   10
## [2,]    2    5    8   11
## [3,]    3    6    9   12
lapply(b, sum)     # 求列表中各元素的和
## $x
## [1] 55
## 
## $y
## [1] 78
#sapply函数:  类似于lapply函数,但输入为列表,返回值为向量
#sapply(X, FUN, ..., )
#X:列表、矩阵、数据框
#FUN:自定义的调用函数
sapply(b, sum)    # 求列表中各元素的和
##  x  y 
## 55 78

2.6 数据处理示例:成绩评估

options(digits=2)

Student <- c("John Davis", "Angela Williams", 
             "Bullwinkle Moose", "David Jones", 
             "Janice Markhammer", "Cheryl Cushing",
             "Reuven Ytzrhak", "Greg Knox", "Joel England",
             "Mary Rayburn")
Math <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)
Science <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)
English <- c(25, 22, 18, 15, 20, 28, 15, 30, 27, 18)

roster <- data.frame(Student, Math, Science, English,
                     stringsAsFactors=FALSE)
z <- scale(roster[,2:4]) 
score <- apply(z, 1, mean)                                            
score
##  [1]  0.56  0.92 -0.86 -1.16 -0.63  0.35 -1.05  1.34  0.70 -0.18
roster <- cbind(roster, score)  #简单将列靠一起
roster
##              Student Math Science English score
## 1         John Davis  502      95      25  0.56
## 2    Angela Williams  600      99      22  0.92
## 3   Bullwinkle Moose  412      80      18 -0.86
## 4        David Jones  358      82      15 -1.16
## 5  Janice Markhammer  495      75      20 -0.63
## 6     Cheryl Cushing  512      85      28  0.35
## 7     Reuven Ytzrhak  410      80      15 -1.05
## 8          Greg Knox  625      95      30  1.34
## 9       Joel England  573      89      27  0.70
## 10      Mary Rayburn  522      86      18 -0.18
y <- quantile(score,   c(.8, .6, .4, .2))                                   
y
##   80%   60%   40%   20% 
##  0.74  0.44 -0.36 -0.89
roster$grade[score >= y[1]] <- "A"                                     
roster$grade[score < y[1] & score >= y[2]] <- "B"
roster$grade[score < y[2] & score >= y[3]] <- "C"
roster$grade[score < y[3] & score >= y[4]] <- "D"
roster$grade[score < y[4]] <- "F"

roster
##              Student Math Science English score grade
## 1         John Davis  502      95      25  0.56     B
## 2    Angela Williams  600      99      22  0.92     A
## 3   Bullwinkle Moose  412      80      18 -0.86     D
## 4        David Jones  358      82      15 -1.16     F
## 5  Janice Markhammer  495      75      20 -0.63     D
## 6     Cheryl Cushing  512      85      28  0.35     C
## 7     Reuven Ytzrhak  410      80      15 -1.05     F
## 8          Greg Knox  625      95      30  1.34     A
## 9       Joel England  573      89      27  0.70     B
## 10      Mary Rayburn  522      86      18 -0.18     C
name <- strsplit((roster$Student), " ")     #返回列表                             
name
## [[1]]
## [1] "John"  "Davis"
## 
## [[2]]
## [1] "Angela"   "Williams"
## 
## [[3]]
## [1] "Bullwinkle" "Moose"     
## 
## [[4]]
## [1] "David" "Jones"
## 
## [[5]]
## [1] "Janice"     "Markhammer"
## 
## [[6]]
## [1] "Cheryl"  "Cushing"
## 
## [[7]]
## [1] "Reuven"  "Ytzrhak"
## 
## [[8]]
## [1] "Greg" "Knox"
## 
## [[9]]
## [1] "Joel"    "England"
## 
## [[10]]
## [1] "Mary"    "Rayburn"
lastname  <- sapply(name,   "[",  2)         #  "[" 函数的应用
lastname
##  [1] "Davis"      "Williams"   "Moose"      "Jones"      "Markhammer"
##  [6] "Cushing"    "Ytzrhak"    "Knox"       "England"    "Rayburn"
firstname <- sapply(name,   "[",  1)
firstname
##  [1] "John"       "Angela"     "Bullwinkle" "David"      "Janice"    
##  [6] "Cheryl"     "Reuven"     "Greg"       "Joel"       "Mary"
roster <- cbind(firstname, lastname, roster[,-1])
roster
##     firstname   lastname Math Science English score grade
## 1        John      Davis  502      95      25  0.56     B
## 2      Angela   Williams  600      99      22  0.92     A
## 3  Bullwinkle      Moose  412      80      18 -0.86     D
## 4       David      Jones  358      82      15 -1.16     F
## 5      Janice Markhammer  495      75      20 -0.63     D
## 6      Cheryl    Cushing  512      85      28  0.35     C
## 7      Reuven    Ytzrhak  410      80      15 -1.05     F
## 8        Greg       Knox  625      95      30  1.34     A
## 9        Joel    England  573      89      27  0.70     B
## 10       Mary    Rayburn  522      86      18 -0.18     C
roster <- roster[order(lastname, firstname), ]

roster
##     firstname   lastname Math Science English score grade
## 6      Cheryl    Cushing  512      85      28  0.35     C
## 1        John      Davis  502      95      25  0.56     B
## 9        Joel    England  573      89      27  0.70     B
## 4       David      Jones  358      82      15 -1.16     F
## 8        Greg       Knox  625      95      30  1.34     A
## 5      Janice Markhammer  495      75      20 -0.63     D
## 3  Bullwinkle      Moose  412      80      18 -0.86     D
## 10       Mary    Rayburn  522      86      18 -0.18     C
## 2      Angela   Williams  600      99      22  0.92     A
## 7      Reuven    Ytzrhak  410      80      15 -1.05     F
#  "[" 是一个函数
x <- 1:10
x
##  [1]  1  2  3  4  5  6  7  8  9 10
x[2]
## [1] 2
"["(x,2)
## [1] 2

2.7 控制流 Control flow

# 重复和循环

#for 结构
#for (var in seq) statement  如:
for (i in 1:10) print("Hello") 
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
#while结构
#while (cond) statement  如:
i <- 10
while( i>0 ) { print("Hello"); i <- i-1 }
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
#条件执行
#if-else结构
#if (cond) statement
#if (cond) statement1 else statement2

grade <- roster$grade
if (is.character(grade)) grade <- as.factor(grade)
grade
##  [1] C B B F A D D C A F
## Levels: A B C D F
grade <- roster$grade
if (!is.factor(grade)) grade <- as.factor(grade)  else print("Grade already is a factor")

#ifelse结构:  if-else结构的紧凑形式
#ifelse (cond,  statement1, statement2)
#与  if (cond) statement1 else statement2  功能相同
#举例:
score
##  [1]  0.56  0.92 -0.86 -1.16 -0.63  0.35 -1.05  1.34  0.70 -0.18
ifelse(score > 0.5, print("Passed"), print("Failed"))
## [1] "Passed"
## [1] "Failed"
##  [1] "Passed" "Passed" "Failed" "Failed" "Failed" "Failed" "Failed" "Passed"
##  [9] "Passed" "Failed"
outcome <- ifelse (score > 0.5, "Passed", "Failed")
outcome
##  [1] "Passed" "Passed" "Failed" "Failed" "Failed" "Failed" "Failed" "Passed"
##  [9] "Passed" "Failed"
#条件执行
#switch结构: 根据表达式的值,选择执行
#switch (expr, list) expr为表达式,其值或为一个整数值或为一个字符串;
#list为一个列表。参数list是有名定义时,则当expr等于元素名时,
#   返回变量名对应的值,否则没有返回值。举例:

feelings <- c("sad", "afraid")
for (i in feelings)
  print(
    switch(i,
           happy = "I am glad you are happy",
           afraid = "There is nothing to fear",
           sad = "Cheer up",
           angry = "Calm down now"
    )
  )
## [1] "Cheer up"
## [1] "There is nothing to fear"
feelings <- c(2, 4, 3, 1)
for (i in feelings)
  print(
    switch(i,
           "I am glad you are happy",
           "There is nothing to fear",
           "Cheer up",
           "Calm down now"
    )
  )
## [1] "There is nothing to fear"
## [1] "Calm down now"
## [1] "Cheer up"
## [1] "I am glad you are happy"
#条件执行常被应用于自定义函数
mystats <- function(x, parametric=TRUE, print=FALSE) 
{
  if (parametric) {  center <- mean(x);   spread <- sd(x)  } 
  else {    center <- median(x);    spread <- mad(x)    }
  
  if (print & parametric)  {     cat("Mean=", center, "\n", "SD=", spread, "\n")  } 
  else if (print & !parametric) {  cat("Median=", center, "\n", "MAD=", spread, "\n")  }
  result <- list(center=center, spread=spread)
  return(result)  }

# trying it out
set.seed(1234)
x <- rnorm(500) 
y <- mystats(x)
y <- mystats(x, parametric=FALSE, print=TRUE)
## Median= -0.021 
##  MAD= 1
# Another switch example
mydate <- function(type="long") {
  switch(type,
         long =  format(Sys.time(), "%A %B %d %Y"), 
         short = format(Sys.time(), "%m-%d-%y"),
         cat(type, "is not a recognized type\n"))
}
mydate("long")
## [1] "星期一 二月 27 2023"
mydate("short")
## [1] "02-27-23"
mydate()
## [1] "星期一 二月 27 2023"
mydate("medium")
## medium is not a recognized type

2.8 整合与重构 Aggregation and restructuring

#转置transpose 的实现: t( )

cars <- mtcars[1:5, 1:4]
cars
##                   mpg cyl disp  hp
## Mazda RX4          21   6  160 110
## Mazda RX4 Wag      21   6  160 110
## Datsun 710         23   4  108  93
## Hornet 4 Drive     21   6  258 110
## Hornet Sportabout  19   8  360 175
t(cars)
##      Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive Hornet Sportabout
## mpg         21            21         23             21                19
## cyl          6             6          4              6                 8
## disp       160           160        108            258               360
## hp         110           110         93            110               175
#整合数据,非常重要。
#比如我们拥有逐小时的气象观测数据,试图得到每天的、每月的平均数据、全年逐时平均的日变化数据等等。
#在R中使用一个或多个by变量和一个预先定义好的函数来折叠(collapse)数据,达到整合的目的是非常容易的。
#调用格式为:
#aggregate(x, by, FUN)
#其中x是待折叠的数据对象,by是一个变量名组成的列表(依据它们归类形成新的观测量),而FUN则是用来计算描述性统计量的标量函数,它将被用来计算新观测中的值。


#按照具有相同(气缸cyl、档位gear)数值的观测进行归类,
#使用FUN=mean函数对所有变量数据进行平均,形成新的数据框aggdata

mtcars   
##                     mpg cyl disp  hp drat  wt qsec vs am gear carb
## Mazda RX4            21   6  160 110  3.9 2.6   16  0  1    4    4
## Mazda RX4 Wag        21   6  160 110  3.9 2.9   17  0  1    4    4
## Datsun 710           23   4  108  93  3.8 2.3   19  1  1    4    1
## Hornet 4 Drive       21   6  258 110  3.1 3.2   19  1  0    3    1
## Hornet Sportabout    19   8  360 175  3.1 3.4   17  0  0    3    2
## Valiant              18   6  225 105  2.8 3.5   20  1  0    3    1
## Duster 360           14   8  360 245  3.2 3.6   16  0  0    3    4
## Merc 240D            24   4  147  62  3.7 3.2   20  1  0    4    2
## Merc 230             23   4  141  95  3.9 3.1   23  1  0    4    2
## Merc 280             19   6  168 123  3.9 3.4   18  1  0    4    4
## Merc 280C            18   6  168 123  3.9 3.4   19  1  0    4    4
## Merc 450SE           16   8  276 180  3.1 4.1   17  0  0    3    3
## Merc 450SL           17   8  276 180  3.1 3.7   18  0  0    3    3
## Merc 450SLC          15   8  276 180  3.1 3.8   18  0  0    3    3
## Cadillac Fleetwood   10   8  472 205  2.9 5.2   18  0  0    3    4
## Lincoln Continental  10   8  460 215  3.0 5.4   18  0  0    3    4
## Chrysler Imperial    15   8  440 230  3.2 5.3   17  0  0    3    4
## Fiat 128             32   4   79  66  4.1 2.2   19  1  1    4    1
## Honda Civic          30   4   76  52  4.9 1.6   19  1  1    4    2
## Toyota Corolla       34   4   71  65  4.2 1.8   20  1  1    4    1
## Toyota Corona        22   4  120  97  3.7 2.5   20  1  0    3    1
## Dodge Challenger     16   8  318 150  2.8 3.5   17  0  0    3    2
## AMC Javelin          15   8  304 150  3.1 3.4   17  0  0    3    2
## Camaro Z28           13   8  350 245  3.7 3.8   15  0  0    3    4
## Pontiac Firebird     19   8  400 175  3.1 3.8   17  0  0    3    2
## Fiat X1-9            27   4   79  66  4.1 1.9   19  1  1    4    1
## Porsche 914-2        26   4  120  91  4.4 2.1   17  0  1    5    2
## Lotus Europa         30   4   95 113  3.8 1.5   17  1  1    5    2
## Ford Pantera L       16   8  351 264  4.2 3.2   14  0  1    5    4
## Ferrari Dino         20   6  145 175  3.6 2.8   16  0  1    5    6
## Maserati Bora        15   8  301 335  3.5 3.6   15  0  1    5    8
## Volvo 142E           21   4  121 109  4.1 2.8   19  1  1    4    2
options(digits=3)
attach(mtcars)
## The following object is masked from package:ggplot2:
## 
##     mpg
   aggdata <-aggregate(mtcars, by=list(cyl,  gear), FUN=mean, na.rm=TRUE)
detach(mtcars)
aggdata
##   Group.1 Group.2  mpg cyl disp  hp drat   wt qsec  vs   am gear carb
## 1       4       3 21.5   4  120  97 3.70 2.46 20.0 1.0 0.00    3 1.00
## 2       6       3 19.8   6  242 108 2.92 3.34 19.8 1.0 0.00    3 1.00
## 3       8       3 15.1   8  358 194 3.12 4.10 17.1 0.0 0.00    3 3.08
## 4       4       4 26.9   4  103  76 4.11 2.38 19.6 1.0 0.75    4 1.50
## 5       6       4 19.8   6  164 116 3.91 3.09 17.7 0.5 0.50    4 4.00
## 6       4       5 28.2   4  108 102 4.10 1.83 16.8 0.5 1.00    5 2.00
## 7       6       5 19.7   6  145 175 3.62 2.77 15.5 0.0 1.00    5 6.00
## 8       8       5 15.4   8  326 300 3.88 3.37 14.6 0.0 1.00    5 6.00
#重构和整合数据集工具包——reshape2

#install.packages("reshape2")

library(reshape2)

mydatatxt <- "
   ID Time  X1 X2
    1    1  5  6
    1    2  3  5
    2    1  6  1
    2    2  2  4"

mydata <-  read.table(header=TRUE, text = mydatatxt)
mydata
##   ID Time X1 X2
## 1  1    1  5  6
## 2  1    2  3  5
## 3  2    1  6  1
## 4  2    2  2  4
#数据融合(melting)
md <- melt(mydata, id=c("ID", "Time"))
md
##   ID Time variable value
## 1  1    1       X1     5
## 2  1    2       X1     3
## 3  2    1       X1     6
## 4  2    2       X1     2
## 5  1    1       X2     6
## 6  1    2       X2     5
## 7  2    1       X2     1
## 8  2    2       X2     4
#重铸 casting
#newdata <- cast(md, formula, FUN)
#md为已融合的数据,formula描述了想要的最后结果,而FUN是(可选的)数据整合函数

dcast(md, ID ~ variable, mean)
##   ID X1  X2
## 1  1  4 5.5
## 2  2  4 2.5
dcast(md, Time ~ variable, mean)
##   Time  X1  X2
## 1    1 5.5 3.5
## 2    2 2.5 4.5
dcast(md, ID ~ Time, mean)
##   ID   1 2
## 1  1 5.5 4
## 2  2 3.5 3
dcast(md, ID+Time ~ variable)
##   ID Time X1 X2
## 1  1    1  5  6
## 2  1    2  3  5
## 3  2    1  6  1
## 4  2    2  2  4
dcast(md, ID + variable ~ Time)
##   ID variable 1 2
## 1  1       X1 5 3
## 2  1       X2 6 5
## 3  2       X1 6 2
## 4  2       X2 1 4
dcast(md, ID ~ variable + Time)
##   ID X1_1 X1_2 X2_1 X2_2
## 1  1    5    3    6    5
## 2  2    6    2    1    4

2.8.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.1

The End of Chapter Two #########################

3 作业

本次作业需要读入温度、相对湿度、PM2.5质量浓度数据(两个站点),一共4个文件,进行相应数据处理和绘图分析。 同学们可参考以下代码,有基础的同学也可尝试我们没讲到的内容,如ggplot包的绘图。

3.1 数据读取与处理

读取两站PM2.5数据,读取温度、湿度

read_data <- function(file_name, path='./data/', skip=3, header=TRUE, sep=',') {
  return (read.table(paste(path, file_name, sep=''), skip=skip, header=header, sep=sep))
}
networkpath <- 'http://162.105.145.16/rs/R_data_samples/'

#美使馆PM2.5观测数据:
aq_embassy <- read_data('AQ_FSPMC_USEmbassy.csv', networkpath, skip=3, header=TRUE)
#重命名aq_embassy数据的各列
colnames(aq_embassy) <- c('Time', 'Height', 'PMe', 'Source', 'Status',  'Method')
#时间从字符串转为POSIXlt格式,以便后续数据处理
aq_embassy$Time <- as.POSIXlt(aq_embassy$Time)

#万柳观测站PM2.5数据:
aq_wanliu <- read_data('AQ_FSPMC_WanLiu.csv', networkpath, skip=3, header=TRUE)
colnames(aq_wanliu) <- c('Time', 'Height', 'PMw', 'Source', 'Status',   'Method')
aq_wanliu$Time <- as.POSIXlt(aq_wanliu$Time)

rh <- read_data('A_RH.csv', networkpath, skip=3, header=TRUE)
colnames(rh) <- c('Time', 'Height', 'RH', 'Source', 'Status',   'Method')
rh$Time <- as.POSIXlt(rh$Time)

temp <- read_data('A_TEMP.csv', networkpath, skip=3, header=TRUE)
colnames(temp) <- c('Time', 'Height', 'Temperature', 'Source', 'ErrFlag',   'Details')
temp$Time <- as.POSIXlt(temp$Time)

3.1.1 数据抽取

利用 subset 函数抽取2019-2022年、数据源为GTS_METAR的数据 基本格式:subset( origin_data , condition ,select=c(colname1,colname2,…) )

#temp_sub <- dplyr::filter(temp, Time$year+1900 <= 2022)

temp_sub <- subset(temp, (Source == 'GTS_METAR') & (Time$year+1900 >= 2019) & (Time$year+1900 <= 2022), select=c('Time', 'Temperature'))

rh_sub <- subset(rh, (Source == 'GTS_METAR') & (Time$year+1900 >= 2019) & (Time$year+1900 <= 2022), select=c('Time', 'RH'))

embassy_sub <- subset(aq_embassy, (Time$year+1900 >= 2019) & (Time$year+1900 <= 2022), select=c('Time', 'PMe'))

wanliu_sub  <- subset(aq_wanliu,  (Time$year+1900 >= 2019) & (Time$year+1900 <= 2022), select=c('Time', 'PMw'))

3.1.2 数据重铸得到整小时数据

library(reshape2)

# 先获取每个DataFrame中小时的数据

# two <- merge(embassy_sub, wanliu_sub, by=c('Time'), all=TRUE)
# two1<- merge(embassy_sub, wanliu_sub, by.x = 'Time', by.y = 'Time', all=TRUE)
# two2<- merge(embassy_sub, wanliu_sub, by.embassy_sub = 'Time', by.wanliu_sub = 'Time')

mHourlyData <- data.frame(rh_sub)

#merge(data_A,data_B,by=c(colname1,colname2,...),all=___ )
#all=TRUE 对应:完整合并,将返回所有匹配与未匹配的数据

for (df in list(temp_sub, embassy_sub, wanliu_sub)) 
  {
     mHourlyData <- merge(mHourlyData, df, by=c('Time'), all=TRUE)
  }

#下一步处理既可以使用aggregate(),也可以调用reshape2包中的melt()与dcast()来完成

#新增一个'Hour'变量,对应以小时为单位计数的时间
mHourlyData$Hour <- format(mHourlyData$Time, '%Y/%m/%d %H:00:00')

#利用'Hour'变量融合,得到中间数据
melted_data <- melt(subset(mHourlyData, select=-c(Time)), id='Hour')

#将中间数据与'Hour'变量关联起来,并且进行'mean'取平均值,na.rm=TRUE在运算中排除了缺失值
mHourlyData_ <- dcast(melted_data, Hour~variable, mean, na.rm=TRUE)

mHourlyData_$Hour <- as.POSIXlt(mHourlyData_$Hour)

#把变量改名, 包名称plyr::很重要!
mHourlyData_ <- plyr::rename(mHourlyData_, c(Hour = "Time"))

mHourlyData <- mHourlyData_ 

rm(mHourlyData_)

整合后的小时级别数据如下

##                  Time   RH Temperature PMe PMw
## 1 2019-01-01 00:00:00 50.0        -9.5  61 NaN
## 2 2019-01-01 01:00:00 54.0       -11.0  57 NaN
## 3 2019-01-01 02:00:00 56.0       -12.0  52 NaN
## 4 2019-01-01 03:00:00 60.7       -13.0  52 NaN
## 5 2019-01-01 04:00:00 58.3       -12.5  46 NaN
## 6 2019-01-01 05:00:00 58.1       -13.5  48 NaN

3.1.3 重铸为每日平均数据

mHourlyData_ <- data.frame(mHourlyData)
mHourlyData_$Day <- substr(mHourlyData_$Time, 1, 10)

melted_data <- melt(subset(mHourlyData_, select=-c(Time)), id='Day')
mDailyData <- dcast(melted_data, Day~variable, mean, na.rm = TRUE )
mDailyData$Day <- as.POSIXlt(mDailyData$Day)

#Another way to replace the above 3 lines:  

#mHourlyData_$Date <- as.POSIXlt( mHourlyData$Time, format = "%Y/%m/%d %H:%M:%S")
#mDailyData <- aggregate(mHourlyData_[2:5], by = list(day = mHourlyData_$Day), FUN=mean, na.rm=TRUE)

日平均后的数据如下

##          Day   RH Temperature   PMe   PMw
## 1 2019-01-01 40.0       -5.90  33.1  16.0
## 2 2019-01-02 40.2       -6.04  54.7 100.4
## 3 2019-01-03 47.1       -5.63 128.2 111.0
## 4 2019-01-04 22.4       -2.04  20.2  21.7
## 5 2019-01-05 22.5       -4.25  13.0  16.9
## 6 2019-01-06 41.8       -7.02  56.5  57.7

mDailyData 数据中增加 PM2.5 差值 (大使馆-万柳)

mDailyData$aq_diff <- with (mDailyData, {
  PMe - PMw
})

根据国家标准在每日数据中,增加 PM2.5 等级因子变量

aq_factor <- function(mpc) {
  mpc_ <- replicate(1, mpc)
  #创建副本
  mpc_[mpc<=35] = 'excellent'
  mpc_[mpc>35 & mpc<=75] = 'good'
  mpc_[mpc>75 & mpc<=115] = 'lightly polluted'
  mpc_[mpc>115 & mpc<=150] = 'moderately polluted'
  mpc_[mpc>150 & mpc<=250] = 'heavily polluted'
  mpc_[mpc>250] = 'severely polluted'
  return (factor(mpc_, ordered=TRUE, levels=c('excellent', 'good', 'lightly polluted', 'moderately polluted', 'heavily polluted', 'severely polluted')))
}

mDailyData$embassy_lvl <- aq_factor(mDailyData$PMe)
mDailyData$wanliu_lvl <- aq_factor(mDailyData$PMw)

#可以将处理后的数据写入文件储存,便于以后直接从日均数据开始数据处理
write.table(mDailyData, "mDailyData.csv",  row.names=FALSE,  col.names=TRUE, sep="," )

3.1.4 计算月平均值

mMonthlyData <- subset(data.frame(mDailyData), select=-c(embassy_lvl, wanliu_lvl))
#mMonthlyData$Year  <- as.integer(substr(mMonthlyData$Day, 1, 4))
mMonthlyData$Month <- as.integer(substr(mMonthlyData$Day, 6, 7))
#月份为整数
#melted_data <- melt(subset(mMonthlyData, select=-c(Day)), id=list('Year', 'Month'))
melted_data <- melt(subset(mMonthlyData, select=-c(Day)), id=list('Month'))

#mMonthlyData <- dcast(melted_data, Year + Month ~ variable, mean, na.rm = TRUE)
mMonthlyData <- dcast(melted_data, Month ~ variable, mean, na.rm = TRUE)
#na.rm=TRUE 剔除缺省数据

#writing the data into a file 
write.table(mMonthlyData, "mMonthlyData.csv",  row.names=FALSE,  col.names=TRUE, sep="," )

月均数据结果如下

##    Month   RH Temperature  PMe  PMw aq_diff
## 1      1 45.7     -2.9222 51.8 49.9   0.450
## 2      2 46.8      0.0412 52.4 52.1  -0.181
## 3      3 44.9      8.3289 52.9 54.5  -1.592
## 4      4 41.2     15.0118 42.5 37.5   5.019
## 5      5 47.6     20.9927 39.6 28.3   9.987
## 6      6 57.3     25.7958 32.8 30.1   4.252
## 7      7 73.4     26.8397 26.6 29.0  -2.517
## 8      8 72.7     25.6338 21.4 22.7  -1.331
## 9      9 69.0     21.7591 30.0 27.4   2.610
## 10    10 61.0     12.4380 38.9 36.9   2.560
## 11    11 57.0      5.5273 44.9 42.7   2.250
## 12    12 46.3     -2.4578 30.6 29.8   0.820

3.1.5 全年平均“日变化”

利用dcast, melt 函数获得变量多年平均的日变化(只有24个小时)数据框

mHourlyData_ <- data.frame(mHourlyData)
mHourlyData_$Hour <- as.integer(substr(mHourlyData_$Time, 12, 13))
melted_data <- melt(subset(mHourlyData_, select=-c(Time)), id='Hour')

#add na.rm=TRUE
mDiurnalData <- dcast(melted_data, Hour~variable, mean, na.rm=TRUE)

mydf <- subset(mHourlyData, Time$hour==3, select=c(Temperature))
mean(mydf$Temperature,  na.rm = TRUE)
## [1] 8.82

所得日变化数据如下所示

##   Hour   RH Temperature  PMe  PMw
## 1    0 66.3       10.29 42.7 38.6
## 2    1 68.1        9.72 42.6 38.2
## 3    2 69.7        9.23 41.5 37.7
## 4    3 70.9        8.82 40.5 37.2
## 5    4 72.1        8.44 39.1 36.6
## 6    5 72.8        8.21 37.9 36.5

3.2 作图部分

颜色自定义: 胭脂红#F03F24 云山蓝#2F90B9 蔚蓝#29B7CB

3.2.1 日均浓度及差值

两观测站日均 PM2.5 浓度图:

opar<-par(pin=c(6,4), mar=c(4,4,3,1))

mDailyData$Day <- as.Date(mDailyData$Day)

x_label<-seq(from=as.Date("2019-01-01"), to=as.Date("2022-12-31"), by=365)

plot(mDailyData$Day, mDailyData$PMe, xaxt="n", yaxt="n", col=rgb(255, 100, 100, maxColorValue=255), type="b", lty =1, cex.main=1.2, pch=20, cex=1.0, cex.lab = 1.0,  xlab='Date', ylab = "PM2.5(ug/m3)", ylim=c(0, 300), main="PM2.5 mass concentration in Beijing (2019/01/01 - 2022/12/31)")

lines(mDailyData$Day, mDailyData$PMw, pch=18, cex=0.8, type="b", lty=3, col=rgb(100, 100, 255, maxColorValue=255))

axis(2, at=seq(0, 300, 20), col.axis="black", las=1, cex.axis=0.8, tck=-0.01)
axis(1, x_label, format(x_label,"%Y-%m-%d"), las=1, col.axis="black", cex.axis=0.8, tck=-0.01)

legend("topleft", ncol=1, inset = .02, c("US Embassy", "Wan Liu"), lty=c(1, 3), pch=c(20, 18), col=c(rgb(255, 100, 100, maxColorValue=255), rgb(100, 100, 255, maxColorValue=255)))

par(opar)

两观测站日均 PM2.5 浓度之差散点图:

opar<-par(pin=c(2.8,2.8))


diff <- na.omit(mDailyData$aq_diff)
rms <- sqrt(mean(diff**2))
tcor <- cor(mDailyData$PMw, mDailyData$PMe, use = "pairwise.complete.obs")

label <- sprintf('R = %.2f\nN = %d\nRMS=%.1f', tcor, length(diff), rms)
  
plot(mDailyData$PMw, mDailyData$PMe, asp = 1.0, col=rgb(255, 100, 100, maxColorValue=255), 
     type="b", lty=0, cex.main=1.0, pch=20, cex=1.0, cex.lab = 1.0,  
     xlab='Wan Liu', ylab = "US Embassy", xlim = c(0, 300), ylim=c(0, 300), 
     main="PM2.5 mass concentration differences\nbetween two stations", xaxs="i", yaxs="i")

#text(40,270, family="mono", label, cex = 0.8)
legend("topleft", ncol=1,inset=0.03, legend = c("R=0.95", "N=1038", "RMS=11.1"), cex=0.8)

abline(lm(mDailyData$PMw ~ mDailyData$PMe), lty =3, col = "blue", lwd = 2.0)

par(opar)

分析: 在多数情况,两测量点的污染物浓度差异并不大(差异绝对值在 20 以内), 但也存在个别天差异较大

大使馆和万柳污染物浓度数据呈现高度的相关性(相关系数为 \(R=0.95\) 已经标注在图中)

3.2.2 全年污染等级统计饼图

mDailyData_ <- subset(mDailyData, select=c(wanliu_lvl))
mDailyData_ <- na.omit(mDailyData_ )
#包plyr::指定很重要!
wanliu_cnt <- plyr::count(mDailyData_, 'wanliu_lvl')

mDailyData_ <- subset(mDailyData, select=c(embassy_lvl))
mDailyData_ <- na.omit(mDailyData_ )
embassy_cnt <- plyr::count(mDailyData_, 'embassy_lvl')

wanliu_cnt$pct  <- with(wanliu_cnt,  100*freq/sum(freq, na.rm=TRUE))
embassy_cnt$pct <- with(embassy_cnt, 100*freq/sum(freq, na.rm=TRUE))

aq_colors <- c('#00E400', '#FFFF00', '#FF7E00', '#FF0000', '#99004C', '#7E0023')

opar <- par(no.readonly=FALSE)
layout(matrix(c(4, 4, 1, 2, 3, 3), 3, 2, byrow=TRUE), widths=c(1, 1), heights=c(0.2, 1, 0.1))
par(mar=c(0, 0.5, 0, 0), cex.sub=1.4)
pie(wanliu_cnt$freq, sprintf('%.1f%%', wanliu_cnt$pct), col=aq_colors, clockwise=FALSE, radius=0.75)
title(sub='Wanliu', line=-2)
par(mar=c(0, 0, 0, 0.5), cex.sub=1.4)
pie(embassy_cnt$freq, sprintf('%.1f%%', embassy_cnt$pct), col=aq_colors, clockwise=FALSE, radius=0.75)
title(sub='US Embassy', line=-2)
plot.new()
par(mar=c(0, 0, 0, 0), cex=0.8)
legend(x='center', legend=levels(wanliu_cnt$wanliu_lvl), cex=0.7, fill=aq_colors, ncol=6)
plot.new()
par(mar=c(0, 0, 0, 0), cex.main=2)
title(main='Air Quality in Beijing',line=-4)

分析: 超过一半天数为优,约 1/3 天数为良,只有少于 1/6 天数存在轻度及以上的污染

3.2.3 月平均柱状图

mMonthlyData_ <- subset(mMonthlyData, select=c(Month, PMe, PMw))


opar <- par(no.readonly=TRUE)
par (mar=c(5,8,4,2))     #mar=c(bottom, left, top, right)

barplot(data = mMonthlyData_, cbind(PMw, PMe) ~ Month, beside = TRUE, col=c("green","yellow"), 
          names.arg= as.character(mMonthlyData_$Month), main="Monthly PM2.5 mass concentration in Beijing", 
          xlab="Date", ylab = "PM2.5",   border = "red")

par(opar)

分析: 高污染主要集中在冬季, 而夏季空气质量相对较好

3.2.4 多年平均温度日变化图

opar<-par(pin=c(6,4), mar=c(4,4,3,1))

plot(mDiurnalData$Hour, mDiurnalData$Temperature, xaxt="n", yaxt="n", col=rgb(255, 100, 100, maxColorValue=255), type="b", lty =1, cex.main=1.2, pch=20, cex=1.0, cex.lab = 1.0,  xlab='Time', ylab = "Temperature", xlim = c(0, 24), ylim=c(5, 20), main="Diurnal Change of Temperature in Beijing")

axis(2, at=seq(5, 20, 1), col.axis="black", las=1, cex.axis=0.8, tck=-0.01)
axis(1, at=seq(0, 24, 1), las=1, col.axis="black", cex.axis=0.8, tck=-0.01)

par(opar)

3.2.5 多年平均相对湿度日变化图

opar<-par(pin=c(6,4), mar=c(4,4,3,1))

plot(mDiurnalData$Hour, mDiurnalData$RH, xaxt="n", yaxt="n", col="#2233FF", type="b", lty =1, cex.main=1.2, pch=20, cex=1.0, cex.lab = 1.0,  xlab='Time', ylab = "Relative HUmidity", xlim = c(0, 23), ylim=c(30, 80), main="Diurnal Change of Relative HUmidity in Beijing")

axis(2, at=seq(30, 80, 5), col.axis="black", las=1, cex.axis=0.8, tck=-0.01)
axis(1, at=seq(0, 24, 1), las=1, col.axis="black", cex.axis=0.8, tck=-0.01)

par(opar)

分析: 日均相对湿度最大值出现在 5 时(与气温最低时间较为接近), 最低值出现在 15 时(与最高气温时间较为接近)

3.2.6 多年平均温度、相对湿度日变化叠加图

opar<-par(pin=c(6,4), mar=c(5,5,3,5))

cut <- 5.0

plot(mDiurnalData$Hour, mDiurnalData$Temperature, xaxt="n", yaxt="n", col=rgb(255, 100, 100, maxColorValue=255), type="b", lty =1, cex.main=1.2, pch=20, cex=1.0, cex.lab = 1.0,  xlab='Time', ylab = "Temperature", xlim = c(0, 23), ylim=c(0, 20), main="Diurnal Change of Temperature & Relative Humidity in Beijing")

lines(mDiurnalData$Hour, mDiurnalData$RH/cut, xaxt="n", yaxt="n", col="#2233FF", type="b", lty =2, pch=21)

axis(1, at=seq(0, 24, 1), las=1, col.axis="black", cex.axis=0.8, tck=-0.01)
axis(2, at=seq(0, 20, 1), col.axis="black", las=2, cex.axis=0.8, tck=-0.01)

mylabels <- as.character(seq(0, 20, 2)*cut) 

axis(side=4, at=seq(0, 20, 2), labels = mylabels, col.axis="black", cex.axis=0.8, tck=-0.01)
mtext("Relative Humidity", side = 4, line = 3)

legend("bottomright", ncol=1, inset = .02, c("Temperature", "Relative Humidity"),
       col = c("red", "blue"), lty = c(1, 2), pch = c(20, 21), cex=0.8)

par(opar)

3.2.7 多年平均 PM2.5 日变化图

opar<-par(pin=c(6,4), mar=c(5,5,3,5))

plot(mDiurnalData$Hour, mDiurnalData$PMe, xaxt="n", yaxt="n", col="#ff8080", type="b", lty =1, cex.main=1.2, pch=20, cex=1.0, cex.lab = 1.0,lwd=2,  xlab='Time', ylab = "PM2.5 (ug/m3)", xlim = c(0, 23), ylim=c(34, 46), main="Diurnal Change of PM2.5 mass concentration in Beijing")

lines(mDiurnalData$Hour, mDiurnalData$PMw, xaxt="n", yaxt="n", col="#2233FF", type="b", lty =2, pch=21, lwd = 2)

axis(1, at=seq(0, 24, 1), las=1, col.axis="black", cex.axis=0.8, tck=-0.01)
axis(2, at=seq(34, 46, 2), col.axis="black", las=2, cex.axis=0.8, tck=-0.01)

legend("top", ncol=1, inset = .02, c("US Embassy", "Haidian/Wanliu"),
       col = c("red", "blue"), lty = c(1, 2), pch = c(20, 21), lwd=c(2,2), cex=0.8)

par(opar)

分析: 污染物浓度在 13时 至 17时之间较低, 在 22时 至 (次日)1 时之间达到高峰

The End