1 初级数据管理
1.1 创建数据框
manager <- c(1, 2, 3, 4, 5)
date <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09")
country <- c("US", "US", "UK", "UK", "UK")
gender <- c("M", "F", "F", "M", "F")
age <- c(32, 45, 25, 39, 99)
q1 <- c(5, 3, 3, 3, 2)
q2 <- c(4, 5, 5, 3, 2)
q3 <- c(5, 2, 5, 4, 1)
q4 <- c(5, 5, 5, NA, 2)
q5 <- c(5, 5, 2, NA, 1)
leadership <- data.frame(manager, date, country, gender, age, q1, q2, q3, q4, q5, stringsAsFactors=FALSE)1.2 在数据框leadership中创建变量sumx, meanx
leadership$sumx = leadership$q1 + leadership$q2
leadership$meanx = (leadership$q1 + leadership$q2)/2
#另外一种写法:
attach(leadership)## The following objects are masked _by_ .GlobalEnv:
##
## age, country, date, gender, manager, q1, q2, q3, q4, q5
leadership$meanx <- (q1 + q2)/2
leadership$sumx <- q1 + q2
# meanx <- (q1 + q2)/2
# sumx <- q1 + q2
detach(leadership)
leadership## manager date country gender age q1 q2 q3 q4 q5 sumx meanx
## 1 1 10/24/08 US M 32 5 4 5 5 5 9 4.5
## 2 2 10/28/08 US F 45 3 5 2 5 5 8 4.0
## 3 3 10/1/08 UK F 25 3 5 5 5 2 8 4.0
## 4 4 10/12/08 UK M 39 3 3 4 NA NA 6 3.0
## 5 5 5/1/09 UK F 99 2 2 1 2 1 4 2.0
#第3种更为简洁的方式:
leadership <- transform(leadership, sumx = q1 + q2, meanx = (q1 + q2)/2)
leadership## manager date country gender age q1 q2 q3 q4 q5 sumx meanx
## 1 1 10/24/08 US M 32 5 4 5 5 5 9 4.5
## 2 2 10/28/08 US F 45 3 5 2 5 5 8 4.0
## 3 3 10/1/08 UK F 25 3 5 5 5 2 8 4.0
## 4 4 10/12/08 UK M 39 3 3 4 NA NA 6 3.0
## 5 5 5/1/09 UK F 99 2 2 1 2 1 4 2.0
leadership <- plyr::mutate(leadership, sumx = NULL, meanx = NULL)
leadership <- plyr::mutate(leadership, sumx =q1 + q2, meanx = (q1 + q2)/2)
leadership## manager date country gender age q1 q2 q3 q4 q5 sumx meanx
## 1 1 10/24/08 US M 32 5 4 5 5 5 9 4.5
## 2 2 10/28/08 US F 45 3 5 2 5 5 8 4.0
## 3 3 10/1/08 UK F 25 3 5 5 5 2 8 4.0
## 4 4 10/12/08 UK M 39 3 3 4 NA NA 6 3.0
## 5 5 5/1/09 UK F 99 2 2 1 2 1 4 2.0
1.3 数据框中变量的重编码
将leadership数据框中经理人的连续型年龄变量age重编码为类别型变量agecat(Young、 Middle Aged、Elder)
leadership$age[ leadership$age == 99 ] <- NA #替换缺失值
leadership$agecat[leadership$age > 75] <- "Elder"
leadership$agecat[leadership$age >= 55 &
leadership$age <= 75] <- "Middle Aged"
leadership$agecat[leadership$age < 55] <- "Young"
#或者用下列函数更简洁地实现:
leadership <- within(leadership,{
agecat <- NA
agecat[age > 75] <- "Elder"
agecat[age >= 55 & age <= 75] <- "Middle Aged"
agecat[age < 55] <- "Young" })
leadership## manager date country gender age q1 q2 q3 q4 q5 sumx meanx agecat
## 1 1 10/24/08 US M 32 5 4 5 5 5 9 4.5 Young
## 2 2 10/28/08 US F 45 3 5 2 5 5 8 4.0 Young
## 3 3 10/1/08 UK F 25 3 5 5 5 2 8 4.0 Young
## 4 4 10/12/08 UK M 39 3 3 4 NA NA 6 3.0 Young
## 5 5 5/1/09 UK F NA 2 2 1 2 1 4 2.0 <NA>
1.4 数据框中变量的重命名
#交互的方式
fix(leadership)
#编程的方式
library(plyr) #或者library(reshape)
#rename(dataframe, c(oldname="newname", oldname="newname",…))
#注意这时并没有改变dataframe ,而是返回了一个新的数据框
rename(leadership, c(manager="managerID", date="testDate"))## managerID testDate country gender age q1 q2 q3 q4 q5 sumx meanx agecat
## 1 1 10/24/08 US M 32 5 4 5 5 5 9 4.5 Young
## 2 2 10/28/08 US F 45 3 5 2 5 5 8 4.0 Young
## 3 3 10/1/08 UK F 25 3 5 5 5 2 8 4.0 Young
## 4 4 10/12/08 UK M 39 3 3 4 NA NA 6 3.0 Young
## 5 5 5/1/09 UK F NA 2 2 1 2 1 4 2.0 <NA>
leadership## manager date country gender age q1 q2 q3 q4 q5 sumx meanx agecat
## 1 1 10/24/08 US M 32 5 4 5 5 5 9 4.5 Young
## 2 2 10/28/08 US F 45 3 5 2 5 5 8 4.0 Young
## 3 3 10/1/08 UK F 25 3 5 5 5 2 8 4.0 Young
## 4 4 10/12/08 UK M 39 3 3 4 NA NA 6 3.0 Young
## 5 5 5/1/09 UK F NA 2 2 1 2 1 4 2.0 <NA>
leadership <- rename(leadership, c(manager="managerID", date="testDate") )
leadership## managerID testDate country gender age q1 q2 q3 q4 q5 sumx meanx agecat
## 1 1 10/24/08 US M 32 5 4 5 5 5 9 4.5 Young
## 2 2 10/28/08 US F 45 3 5 2 5 5 8 4.0 Young
## 3 3 10/1/08 UK F 25 3 5 5 5 2 8 4.0 Young
## 4 4 10/12/08 UK M 39 3 3 4 NA NA 6 3.0 Young
## 5 5 5/1/09 UK F NA 2 2 1 2 1 4 2.0 <NA>
#不同包中的同名函数可能调用方式不一样:
#如果不指明包名,后边加载的包里边的函数会屏蔽前面的
#如,在dplyr中 rename(dataframe, newname= oldname, newname= oldname,…))
#为了明确调用不发生混淆,在函数名前指定包:
# dplyr::rename:
library(dplyr) ##
## 载入程辑包:'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
dplyr::rename(leadership, date = testDate)## managerID date country gender age q1 q2 q3 q4 q5 sumx meanx agecat
## 1 1 10/24/08 US M 32 5 4 5 5 5 9 4.5 Young
## 2 2 10/28/08 US F 45 3 5 2 5 5 8 4.0 Young
## 3 3 10/1/08 UK F 25 3 5 5 5 2 8 4.0 Young
## 4 4 10/12/08 UK M 39 3 3 4 NA NA 6 3.0 Young
## 5 5 5/1/09 UK F NA 2 2 1 2 1 4 2.0 <NA>
leadership## managerID testDate country gender age q1 q2 q3 q4 q5 sumx meanx agecat
## 1 1 10/24/08 US M 32 5 4 5 5 5 9 4.5 Young
## 2 2 10/28/08 US F 45 3 5 2 5 5 8 4.0 Young
## 3 3 10/1/08 UK F 25 3 5 5 5 2 8 4.0 Young
## 4 4 10/12/08 UK M 39 3 3 4 NA NA 6 3.0 Young
## 5 5 5/1/09 UK F NA 2 2 1 2 1 4 2.0 <NA>
#也可以通过names()函数来重命名变量
names(leadership)[2] <- "testDate "
leadership## managerID testDate country gender age q1 q2 q3 q4 q5 sumx meanx agecat
## 1 1 10/24/08 US M 32 5 4 5 5 5 9 4.5 Young
## 2 2 10/28/08 US F 45 3 5 2 5 5 8 4.0 Young
## 3 3 10/1/08 UK F 25 3 5 5 5 2 8 4.0 Young
## 4 4 10/12/08 UK M 39 3 3 4 NA NA 6 3.0 Young
## 5 5 5/1/09 UK F NA 2 2 1 2 1 4 2.0 <NA>
#或者直接对leadership中的若干相应变量的名称进行修改:
names(leadership)[6:10] <- c("item1", "item2", "item3", "item4", "item5")
leadership## managerID testDate country gender age item1 item2 item3 item4 item5 sumx
## 1 1 10/24/08 US M 32 5 4 5 5 5 9
## 2 2 10/28/08 US F 45 3 5 2 5 5 8
## 3 3 10/1/08 UK F 25 3 5 5 5 2 8
## 4 4 10/12/08 UK M 39 3 3 4 NA NA 6
## 5 5 5/1/09 UK F NA 2 2 1 2 1 4
## meanx agecat
## 1 4.5 Young
## 2 4.0 Young
## 3 4.0 Young
## 4 3.0 Young
## 5 2.0 <NA>
1.5 缺失值NA
缺失值以符号NA(Not Available,不可用)表示; 不可能出现的值(例如,被0除的结果)通过符号NaN(Not a Number,非数值)来表示。
is.na(leadership[,6:10])## item1 item2 item3 item4 item5
## 1 FALSE FALSE FALSE FALSE FALSE
## 2 FALSE FALSE FALSE FALSE FALSE
## 3 FALSE FALSE FALSE FALSE FALSE
## 4 FALSE FALSE FALSE TRUE TRUE
## 5 FALSE FALSE FALSE FALSE FALSE
#利用na.omit()删除不完整的观测(行)
newdata <- na.omit(leadership)
newdata## managerID testDate country gender age item1 item2 item3 item4 item5 sumx
## 1 1 10/24/08 US M 32 5 4 5 5 5 9
## 2 2 10/28/08 US F 45 3 5 2 5 5 8
## 3 3 10/1/08 UK F 25 3 5 5 5 2 8
## meanx agecat
## 1 4.5 Young
## 2 4.0 Young
## 3 4.0 Young
1.6 日期处理
#从字符串转为日期变量:
names(leadership)[2] <- "Date"
myformat <- "%m/%d/%y"
leadership$Date <- as.Date(leadership$Date, myformat)
leadership## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## 2 2 2008-10-28 US F 45 3 5 2 5 5 8
## 3 3 2008-10-01 UK F 25 3 5 5 5 2 8
## 4 4 2008-10-12 UK M 39 3 3 4 NA NA 6
## 5 5 2009-05-01 UK F NA 2 2 1 2 1 4
## meanx agecat
## 1 4.5 Young
## 2 4.0 Young
## 3 4.0 Young
## 4 3.0 Young
## 5 2.0 <NA>
#处理时间戳数据的函数
#Sys.Date()可以返回当天的日期(系统时间),而date()则返回当前的日期和时间
Sys.Date()## [1] "2023-02-27"
date()## [1] "Mon Feb 27 17:52:23 2023"
today <- Sys.Date()
format(today, format="%m %d %Y")## [1] "02 27 2023"
format(today, format="%A")## [1] "星期一"
format(today-1, format="%A")## [1] "星期日"
#日期计算
startdate <- as.Date("2004-02-13")
enddate <- as.Date("2011-01-22")
enddate - startdate## Time difference of 2535 days
today <- Sys.Date()
dob <- as.Date("1956-10-12")
difftime(today, dob, units="weeks")## Time difference of 3463.429 weeks
#将日期转换为字符型变量
strDates <- as.character(leadership$Date)
strDates## [1] "2008-10-24" "2008-10-28" "2008-10-01" "2008-10-12" "2009-05-01"
a <- c(1,2,3)
is.numeric(a)## [1] TRUE
is.vector(a)## [1] TRUE
a <- as.character(a)
a## [1] "1" "2" "3"
is.numeric(a)## [1] FALSE
is.vector(a)## [1] TRUE
is.character(a)## [1] TRUE
1.7 时间类POSIXct和POSIXlt
POSIXct格式的时间:以有符号整数形式存储,表示从1970-01-01到该时间点经过的秒数; POSIXlt格式的时间:以字符串形式存储,包含年月日时间等。可以方便地提取年、月、日等变量
as.POSIXlt("2010/01/01")## [1] "2010-01-01 CST"
as.POSIXlt("2010/01/01 10:30:30")## [1] "2010-01-01 10:30:30 CST"
tt <- Sys.time()
as.POSIXlt(3600 * 24 * 1001, origin = tt)## [1] "2025-11-24 17:52:23 CST"
as.POSIXlt(3600 * 24 * 1001, origin = tt, tz = "UTC")## [1] "2025-11-24 09:52:23 UTC"
as.POSIXlt("10:30:30 2010/01/01", format = "%H:%M:%S %Y/%m/%d")## [1] "2010-01-01 10:30:30 CST"
(ct <- as.POSIXct("1980-09-30 10:11:12"))## [1] "1980-09-30 10:11:12 CST"
(lt <- as.POSIXlt("1980-09-30 10:11:12"))## [1] "1980-09-30 10:11:12 CST"
unlist(lt)## sec min hour mday mon year wday yday isdst zone gmtoff
## "12" "11" "10" "30" "8" "80" "2" "273" "0" "CST" NA
unlist(ct)## [1] "1980-09-30 10:11:12 CST"
c(1900 + lt$year, lt$mon+1, lt$mday, lt$hour, lt$min, lt$sec)## [1] 1980 9 30 10 11 12
1.8 时区
Sys.timezone()## [1] "Asia/Taipei"
Sys.setenv(TZ ="Asia/Shanghai")
Sys.timezone()## [1] "Asia/Shanghai"
Sys.getenv("TZ")## [1] "Asia/Shanghai"
t <- as.POSIXlt("2008-04-08 7:10:15", format="%Y-%m-%d %H:%M:%S")
t## [1] "2008-04-08 07:10:15 CST"
#不同时区的自动转换:
t <- lubridate::as_datetime(t, tz= "UTC")
t## [1] "2008-04-07 23:10:15 UTC"
1.9 lubridate包的使用
library(lubridate)##
## 载入程辑包:'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
ymd("20110604")## [1] "2011-06-04"
mdy("06-04-2011")## [1] "2011-06-04"
dmy("04/06/2011")## [1] "2011-06-04"
arrive <- ymd_hms("2011-06-04 12:00:00", tz = "Asia/Shanghai")
arrive## [1] "2011-06-04 12:00:00 CST"
leave <- ymd_hms("2011-08-10 14:00:00", tz = "Asia/Shanghai")
leave## [1] "2011-08-10 14:00:00 CST"
second(arrive)## [1] 0
second(arrive) <- 25
arrive## [1] "2011-06-04 12:00:25 CST"
second(arrive) <- 0
wday(arrive)## [1] 7
wday(arrive, label = TRUE)## [1] 周六
## Levels: 周日 < 周一 < 周二 < 周三 < 周四 < 周五 < 周六
meeting <- ymd_hms("2011-07-01 09:00:00", tz = "Asia/Shanghai")
with_tz(meeting, "GMT")## [1] "2011-07-01 01:00:00 GMT"
meeting## [1] "2011-07-01 09:00:00 CST"
(mistake <- force_tz(meeting, "UTC"))## [1] "2011-07-01 09:00:00 UTC"
with_tz(mistake, "Asia/Shanghai")## [1] "2011-07-01 17:00:00 CST"
meeting <- ymd_hms("2011-07-01 09:00:00")
meeting## [1] "2011-07-01 09:00:00 UTC"
with_tz(meeting, "Asia/Shanghai")## [1] "2011-07-01 17:00:00 CST"
meeting <- ymd_hm("2011-07-01 09:00")
meeting## [1] "2011-07-01 09:00:00 UTC"
meeting <- ymd_hm("11-07-01 09:00")
meeting## [1] "2011-07-01 09:00:00 UTC"
meeting <- ymd_h("2011-07-01 09")
meeting## [1] "2011-07-01 09:00:00 UTC"
meeting <- dmy_hms("01-07-2011 09:00:00")
meeting## [1] "2011-07-01 09:00:00 UTC"
meeting <- dmy_hms("01-07-11 09:00:00")
meeting## [1] "2011-07-01 09:00:00 UTC"
meeting <- dmy_hm("01-07-11 09:00")
meeting## [1] "2011-07-01 09:00:00 UTC"
meeting <- dmy_h("01-07-11 09")
meeting## [1] "2011-07-01 09:00:00 UTC"
meeting <- dmy_hm("010711 09:00")
meeting## [1] "2011-07-01 09:00:00 UTC"
meeting <- dmy_hm("01/07/11 09/00")
meeting## [1] "2011-07-01 09:00:00 UTC"
meeting <- mdy_hms("070111 09:00:15")
meeting## [1] "2011-07-01 09:00:15 UTC"
meeting <- mdy_hm("0701110915")
meeting## [1] "2011-07-01 09:15:00 UTC"
meeting <- mdy_h("07011109")
meeting## [1] "2011-07-01 09:00:00 UTC"
meeting <- ydm_hms("11-01-07 09:15:00")
meeting## [1] "2011-07-01 09:15:00 UTC"
meeting <- ydm_hm("11-01-07 09:15")
meeting## [1] "2011-07-01 09:15:00 UTC"
meeting <- ydm_h("11010709")
meeting## [1] "2011-07-01 09:00:00 UTC"
#Time Intervals
interval1 <- interval(arrive, leave)
interval1## [1] 2011-06-04 12:00:00 CST--2011-08-10 14:00:00 CST
interval1 <- arrive %--% leave
interval1## [1] 2011-06-04 12:00:00 CST--2011-08-10 14:00:00 CST
str(interval1)## Formal class 'Interval' [package "lubridate"] with 3 slots
## ..@ .Data: num 5796000
## ..@ start: POSIXct[1:1], format: "2011-06-04 12:00:00"
## ..@ tzone: chr "Asia/Shanghai"
interval2 <- interval(ymd(20110720, tz = "Asia/Shanghai"), ymd(20110831, tz = "Asia/Shanghai"))
interval2## [1] 2011-07-20 CST--2011-08-31 CST
int_overlaps(interval2, interval1)## [1] TRUE
#[1] TRUE
setdiff(interval1, interval2)## [1] 2011-06-04 12:00:00 CST--2011-07-20 CST
setdiff(interval2, interval1)## [1] 2011-08-10 14:00:00 CST--2011-08-31 CST
#Arithmetic with date times
minutes(2) ## period## [1] "2M 0S"
dminutes(2) ## duration## [1] "120s (~2 minutes)"
leap_year(2011) ## regular year## [1] FALSE
# [1] FALSE
ymd(20110101) + dyears(1)## [1] "2012-01-01 06:00:00 UTC"
ymd(20110101) + years(1)## [1] "2012-01-01"
leap_year(2012) ## leap year## [1] TRUE
#> [1] TRUE
meeting## [1] "2011-07-01 09:00:00 UTC"
(meetings <- meeting + weeks(0:5))## [1] "2011-07-01 09:00:00 UTC" "2011-07-08 09:00:00 UTC"
## [3] "2011-07-15 09:00:00 UTC" "2011-07-22 09:00:00 UTC"
## [5] "2011-07-29 09:00:00 UTC" "2011-08-05 09:00:00 UTC"
meetings %within% interval2## [1] FALSE FALSE FALSE TRUE TRUE TRUE
#[1] FALSE FALSE FALSE TRUE TRUE TRUE
interval1 / ddays(1)## [1] 67.08333
#> [1] 67.08333
interval1 / ddays(2)## [1] 33.54167
#> [1] 33.54167
interval1 / dminutes(1)## [1] 96600
#> [1] 96600
interval1## [1] 2011-06-04 12:00:00 CST--2011-08-10 14:00:00 CST
interval1 %/% months(1)## [1] 2
#> [1] 2
interval1 %% months(1)## [1] 2011-08-04 12:00:00 CST--2011-08-10 14:00:00 CST
as.period(interval1 %% months(1))## [1] "6d 2H 0M 0S"
#> [1] "6d 2H 0M 0S"
as.period(interval1)## [1] "2m 6d 2H 0M 0S"
#> [1] "2m 6d 2H 0M 0S"
jan31 <- ymd("2013-01-31")
jan31 + months(0:11)## [1] "2013-01-31" NA "2013-03-31" NA "2013-05-31"
## [6] NA "2013-07-31" "2013-08-31" NA "2013-10-31"
## [11] NA "2013-12-31"
#> [1] "2013-01-31" NA "2013-03-31" NA "2013-05-31"
#> [6] NA "2013-07-31" "2013-08-31" NA "2013-10-31"
#> [11] NA "2013-12-31"
floor_date(jan31, "month")## [1] "2013-01-01"
floor_date(jan31, "month") + months(0:11) + days(31)## [1] "2013-02-01" "2013-03-04" "2013-04-01" "2013-05-02" "2013-06-01"
## [6] "2013-07-02" "2013-08-01" "2013-09-01" "2013-10-02" "2013-11-01"
## [11] "2013-12-02" "2014-01-01"
#> [1] "2013-02-01" "2013-03-04" "2013-04-01" "2013-05-02" "2013-06-01"
#> [6] "2013-07-02" "2013-08-01" "2013-09-01" "2013-10-02" "2013-11-01"
#> [11] "2013-12-02" "2014-01-01"
jan31 %m+% months(0:11)## [1] "2013-01-31" "2013-02-28" "2013-03-31" "2013-04-30" "2013-05-31"
## [6] "2013-06-30" "2013-07-31" "2013-08-31" "2013-09-30" "2013-10-31"
## [11] "2013-11-30" "2013-12-31"
#> [1] "2013-01-31" "2013-02-28" "2013-03-31" "2013-04-30" "2013-05-31"
#> [6] "2013-06-30" "2013-07-31" "2013-08-31" "2013-09-30" "2013-10-31"
#> [11] "2013-11-30" "2013-12-31"
(jan02 <- ymd("2013-01-02"))## [1] "2013-01-02"
ceiling_date(jan02, "month")## [1] "2013-02-01"
ceiling_date(jan02, "year")## [1] "2014-01-01"
ceiling_date(jan02, "day")## [1] "2013-01-03"
last_day <- function(date) {
ceiling_date(date, "month") - days(1)
}
(jan02 <- ymd("2013-01-02"))## [1] "2013-01-02"
last_day(jan02)## [1] "2013-01-31"
1.10 数据排序
sorting data: order( )
leadership## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## 2 2 2008-10-28 US F 45 3 5 2 5 5 8
## 3 3 2008-10-01 UK F 25 3 5 5 5 2 8
## 4 4 2008-10-12 UK M 39 3 3 4 NA NA 6
## 5 5 2009-05-01 UK F NA 2 2 1 2 1 4
## meanx agecat
## 1 4.5 Young
## 2 4.0 Young
## 3 4.0 Young
## 4 3.0 Young
## 5 2.0 <NA>
order(leadership$age) #返回的是排序序号## [1] 3 1 4 2 5
newdata <- leadership[ order(leadership$age), ]
newdata## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 3 3 2008-10-01 UK F 25 3 5 5 5 2 8
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## 4 4 2008-10-12 UK M 39 3 3 4 NA NA 6
## 2 2 2008-10-28 US F 45 3 5 2 5 5 8
## 5 5 2009-05-01 UK F NA 2 2 1 2 1 4
## meanx agecat
## 3 4.0 Young
## 1 4.5 Young
## 4 3.0 Young
## 2 4.0 Young
## 5 2.0 <NA>
newdata[2,]## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## meanx agecat
## 1 4.5 Young
#先按照性别,再按照年龄排队:
attach(leadership)## The following objects are masked _by_ .GlobalEnv:
##
## age, country, gender
newdata <- leadership[order(gender, age),]
detach(leadership)
newdata## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 3 3 2008-10-01 UK F 25 3 5 5 5 2 8
## 2 2 2008-10-28 US F 45 3 5 2 5 5 8
## 5 5 2009-05-01 UK F NA 2 2 1 2 1 4
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## 4 4 2008-10-12 UK M 39 3 3 4 NA NA 6
## meanx agecat
## 3 4.0 Young
## 2 4.0 Young
## 5 2.0 <NA>
## 1 4.5 Young
## 4 3.0 Young
#倒序:
attach(leadership)## The following objects are masked _by_ .GlobalEnv:
##
## age, country, gender
newdata <-leadership[order(gender, -age),]
detach(leadership)
newdata## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 5 5 2009-05-01 UK F NA 2 2 1 2 1 4
## 2 2 2008-10-28 US F 45 3 5 2 5 5 8
## 3 3 2008-10-01 UK F 25 3 5 5 5 2 8
## 4 4 2008-10-12 UK M 39 3 3 4 NA NA 6
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## meanx agecat
## 5 2.0 <NA>
## 2 4.0 Young
## 3 4.0 Young
## 4 3.0 Young
## 1 4.5 Young
1.11 数据集的合并 Merging datasets
添加列—横向合并两个数据框(数据集): total <- merge(dataframeA, dataframeB, by=“ID”) total <- merge(dataframeA, dataframeB, by=c(“ID”,“Country”))
如果要直接横向合并两个矩阵或数据框,并且不需要指定一个公共索引,那么可以直接使用cbind()函数: total <- cbind(A, B)
添加行—纵向合并两个数据框(数据集), 使用rbind()函数: total <- rbind(dataframeA, dataframeB)
leadership## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## 2 2 2008-10-28 US F 45 3 5 2 5 5 8
## 3 3 2008-10-01 UK F 25 3 5 5 5 2 8
## 4 4 2008-10-12 UK M 39 3 3 4 NA NA 6
## 5 5 2009-05-01 UK F NA 2 2 1 2 1 4
## meanx agecat
## 1 4.5 Young
## 2 4.0 Young
## 3 4.0 Young
## 4 3.0 Young
## 5 2.0 <NA>
newdata## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 5 5 2009-05-01 UK F NA 2 2 1 2 1 4
## 2 2 2008-10-28 US F 45 3 5 2 5 5 8
## 3 3 2008-10-01 UK F 25 3 5 5 5 2 8
## 4 4 2008-10-12 UK M 39 3 3 4 NA NA 6
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## meanx agecat
## 5 2.0 <NA>
## 2 4.0 Young
## 3 4.0 Young
## 4 3.0 Young
## 1 4.5 Young
total <- merge(leadership, newdata, by="managerID")
total## managerID Date.x country.x gender.x age.x item1.x item2.x item3.x item4.x
## 1 1 2008-10-24 US M 32 5 4 5 5
## 2 2 2008-10-28 US F 45 3 5 2 5
## 3 3 2008-10-01 UK F 25 3 5 5 5
## 4 4 2008-10-12 UK M 39 3 3 4 NA
## 5 5 2009-05-01 UK F NA 2 2 1 2
## item5.x sumx.x meanx.x agecat.x Date.y country.y gender.y age.y item1.y
## 1 5 9 4.5 Young 2008-10-24 US M 32 5
## 2 5 8 4.0 Young 2008-10-28 US F 45 3
## 3 2 8 4.0 Young 2008-10-01 UK F 25 3
## 4 NA 6 3.0 Young 2008-10-12 UK M 39 3
## 5 1 4 2.0 <NA> 2009-05-01 UK F NA 2
## item2.y item3.y item4.y item5.y sumx.y meanx.y agecat.y
## 1 4 5 5 5 9 4.5 Young
## 2 5 2 5 5 8 4.0 Young
## 3 5 5 5 2 8 4.0 Young
## 4 3 4 NA NA 6 3.0 Young
## 5 2 1 2 1 4 2.0 <NA>
total <- cbind(leadership, newdata)
total #简单靠一起, 名字发生重复## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## 2 2 2008-10-28 US F 45 3 5 2 5 5 8
## 3 3 2008-10-01 UK F 25 3 5 5 5 2 8
## 4 4 2008-10-12 UK M 39 3 3 4 NA NA 6
## 5 5 2009-05-01 UK F NA 2 2 1 2 1 4
## meanx agecat managerID Date country gender age item1 item2 item3 item4
## 1 4.5 Young 5 2009-05-01 UK F NA 2 2 1 2
## 2 4.0 Young 2 2008-10-28 US F 45 3 5 2 5
## 3 4.0 Young 3 2008-10-01 UK F 25 3 5 5 5
## 4 3.0 Young 4 2008-10-12 UK M 39 3 3 4 NA
## 5 2.0 <NA> 1 2008-10-24 US M 32 5 4 5 5
## item5 sumx meanx agecat
## 1 1 4 2.0 <NA>
## 2 5 8 4.0 Young
## 3 2 8 4.0 Young
## 4 NA 6 3.0 Young
## 5 5 9 4.5 Young
names(total)[11:15] <- c("manager1", "Date1", "Country1", "gender1", "age1")
total## managerID Date country gender age item1 item2 item3 item4 item5
## 1 1 2008-10-24 US M 32 5 4 5 5 5
## 2 2 2008-10-28 US F 45 3 5 2 5 5
## 3 3 2008-10-01 UK F 25 3 5 5 5 2
## 4 4 2008-10-12 UK M 39 3 3 4 NA NA
## 5 5 2009-05-01 UK F NA 2 2 1 2 1
## manager1 Date1 Country1 gender1 age1 country gender age item1 item2
## 1 9 4.5 Young 5 2009-05-01 UK F NA 2 2
## 2 8 4.0 Young 2 2008-10-28 US F 45 3 5
## 3 8 4.0 Young 3 2008-10-01 UK F 25 3 5
## 4 6 3.0 Young 4 2008-10-12 UK M 39 3 3
## 5 4 2.0 <NA> 1 2008-10-24 US M 32 5 4
## item3 item4 item5 sumx meanx agecat
## 1 1 2 1 4 2.0 <NA>
## 2 2 5 5 8 4.0 Young
## 3 5 5 2 8 4.0 Young
## 4 4 NA NA 6 3.0 Young
## 5 5 5 5 9 4.5 Young
total <- rbind(leadership, newdata)
total## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## 2 2 2008-10-28 US F 45 3 5 2 5 5 8
## 3 3 2008-10-01 UK F 25 3 5 5 5 2 8
## 4 4 2008-10-12 UK M 39 3 3 4 NA NA 6
## 5 5 2009-05-01 UK F NA 2 2 1 2 1 4
## 51 5 2009-05-01 UK F NA 2 2 1 2 1 4
## 21 2 2008-10-28 US F 45 3 5 2 5 5 8
## 31 3 2008-10-01 UK F 25 3 5 5 5 2 8
## 41 4 2008-10-12 UK M 39 3 3 4 NA NA 6
## 11 1 2008-10-24 US M 32 5 4 5 5 5 9
## meanx agecat
## 1 4.5 Young
## 2 4.0 Young
## 3 4.0 Young
## 4 3.0 Young
## 5 2.0 <NA>
## 51 2.0 <NA>
## 21 4.0 Young
## 31 4.0 Young
## 41 3.0 Young
## 11 4.5 Young
1.12 数据集取子集
从leadership数据框中选择了变量q1、q2、q3、q4和q5,并将它们保存到了数据框newdata中: 一下三种方式功能一样的:
leadership## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## 2 2 2008-10-28 US F 45 3 5 2 5 5 8
## 3 3 2008-10-01 UK F 25 3 5 5 5 2 8
## 4 4 2008-10-12 UK M 39 3 3 4 NA NA 6
## 5 5 2009-05-01 UK F NA 2 2 1 2 1 4
## meanx agecat
## 1 4.5 Young
## 2 4.0 Young
## 3 4.0 Young
## 4 3.0 Young
## 5 2.0 <NA>
newdata <- leadership[, c(6:10)]
newdata## item1 item2 item3 item4 item5
## 1 5 4 5 5 5
## 2 3 5 2 5 5
## 3 3 5 5 5 2
## 4 3 3 4 NA NA
## 5 2 2 1 2 1
newdata <- leadership[c(6:10)]
newdata## item1 item2 item3 item4 item5
## 1 5 4 5 5 5
## 2 3 5 2 5 5
## 3 3 5 5 5 2
## 4 3 3 4 NA NA
## 5 2 2 1 2 1
myvars <- c("item1", "item2", "item3", "item4", "item5")
newdata <-leadership[myvars]
newdata## item1 item2 item3 item4 item5
## 1 5 4 5 5 5
## 2 3 5 2 5 5
## 3 3 5 5 5 2
## 4 3 3 4 NA NA
## 5 2 2 1 2 1
myvars <- paste("item", 1:5, sep="")
newdata <- leadership[myvars]
newdata## item1 item2 item3 item4 item5
## 1 5 4 5 5 5
## 2 3 5 2 5 5
## 3 3 5 5 5 2
## 4 3 3 4 NA NA
## 5 2 2 1 2 1
1.13 剔除(丢弃)变量
myvars <- names(leadership) %in% c("item3", "item4")
myvars## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
## [13] FALSE
newdata <- leadership[ !myvars ]
newdata## managerID Date country gender age item1 item2 item5 sumx meanx agecat
## 1 1 2008-10-24 US M 32 5 4 5 9 4.5 Young
## 2 2 2008-10-28 US F 45 3 5 5 8 4.0 Young
## 3 3 2008-10-01 UK F 25 3 5 2 8 4.0 Young
## 4 4 2008-10-12 UK M 39 3 3 NA 6 3.0 Young
## 5 5 2009-05-01 UK F NA 2 2 1 4 2.0 <NA>
#在知道q3和q4是第8个和第9个变量的情况下:
newdata <- leadership[ c(-8, -9) ]
newdata## managerID Date country gender age item1 item2 item5 sumx meanx agecat
## 1 1 2008-10-24 US M 32 5 4 5 9 4.5 Young
## 2 2 2008-10-28 US F 45 3 5 5 8 4.0 Young
## 3 3 2008-10-01 UK F 25 3 5 2 8 4.0 Young
## 4 4 2008-10-12 UK M 39 3 3 NA 6 3.0 Young
## 5 5 2009-05-01 UK F NA 2 2 1 4 2.0 <NA>
#或者直接赋值NULL :
leadership$q3 <- leadership$q4 <- NULL
leadership## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## 2 2 2008-10-28 US F 45 3 5 2 5 5 8
## 3 3 2008-10-01 UK F 25 3 5 5 5 2 8
## 4 4 2008-10-12 UK M 39 3 3 4 NA NA 6
## 5 5 2009-05-01 UK F NA 2 2 1 2 1 4
## meanx agecat
## 1 4.5 Young
## 2 4.0 Young
## 3 4.0 Young
## 4 3.0 Young
## 5 2.0 <NA>
1.14 选入观测(行)
leadership## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## 2 2 2008-10-28 US F 45 3 5 2 5 5 8
## 3 3 2008-10-01 UK F 25 3 5 5 5 2 8
## 4 4 2008-10-12 UK M 39 3 3 4 NA NA 6
## 5 5 2009-05-01 UK F NA 2 2 1 2 1 4
## meanx agecat
## 1 4.5 Young
## 2 4.0 Young
## 3 4.0 Young
## 4 3.0 Young
## 5 2.0 <NA>
newdata <- leadership[1:3,]
newdata## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## 2 2 2008-10-28 US F 45 3 5 2 5 5 8
## 3 3 2008-10-01 UK F 25 3 5 5 5 2 8
## meanx agecat
## 1 4.5 Young
## 2 4.0 Young
## 3 4.0 Young
newdata <- leadership[which(leadership$gender=="M" &
leadership$age > 30), ]
newdata## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## 4 4 2008-10-12 UK M 39 3 3 4 NA NA 6
## meanx agecat
## 1 4.5 Young
## 4 3.0 Young
attach(leadership)## The following objects are masked _by_ .GlobalEnv:
##
## age, country, gender
newdata <- leadership[which(gender=='M' & age > 30), ]
detach(leadership)
leadership$date <- as.Date(leadership$Date, "%m/%d/%y")
startdate <- as.Date("2009-01-01")
enddate <- as.Date("2009-10-31")
newdata <- leadership[which(leadership$date >= startdate &
leadership$date <= enddate), ]
newdata## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 5 5 2009-05-01 UK F NA 2 2 1 2 1 4
## meanx agecat date
## 5 2 <NA> 2009-05-01
x <- c(1,2,3,4,5,6,7,8,9,10)
x[c(TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE)]## [1] 1 10
#1 10
leadership## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## 2 2 2008-10-28 US F 45 3 5 2 5 5 8
## 3 3 2008-10-01 UK F 25 3 5 5 5 2 8
## 4 4 2008-10-12 UK M 39 3 3 4 NA NA 6
## 5 5 2009-05-01 UK F NA 2 2 1 2 1 4
## meanx agecat date
## 1 4.5 Young 2008-10-24
## 2 4.0 Young 2008-10-28
## 3 4.0 Young 2008-10-01
## 4 3.0 Young 2008-10-12
## 5 2.0 <NA> 2009-05-01
newdata <- leadership[ leadership$date >= startdate & leadership$date <= enddate, ]
newdata## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 5 5 2009-05-01 UK F NA 2 2 1 2 1 4
## meanx agecat date
## 5 2 <NA> 2009-05-01
subset
#在下面第一个示例中,选择了所有age值大于等于35或age值小于24的行,保留了变量q1、q2:
leadership ## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## 2 2 2008-10-28 US F 45 3 5 2 5 5 8
## 3 3 2008-10-01 UK F 25 3 5 5 5 2 8
## 4 4 2008-10-12 UK M 39 3 3 4 NA NA 6
## 5 5 2009-05-01 UK F NA 2 2 1 2 1 4
## meanx agecat date
## 1 4.5 Young 2008-10-24
## 2 4.0 Young 2008-10-28
## 3 4.0 Young 2008-10-01
## 4 3.0 Young 2008-10-12
## 5 2.0 <NA> 2009-05-01
newdata <- subset(leadership, age >= 35 | age < 24, select=c(item1, item2))
newdata## item1 item2
## 2 3 5
## 4 3 3
#在下面第二个示例中,选择了所有25岁以上的男性,并保留了变量gender到q2(gender、q4和其间所有列):
leadership## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## 2 2 2008-10-28 US F 45 3 5 2 5 5 8
## 3 3 2008-10-01 UK F 25 3 5 5 5 2 8
## 4 4 2008-10-12 UK M 39 3 3 4 NA NA 6
## 5 5 2009-05-01 UK F NA 2 2 1 2 1 4
## meanx agecat date
## 1 4.5 Young 2008-10-24
## 2 4.0 Young 2008-10-28
## 3 4.0 Young 2008-10-01
## 4 3.0 Young 2008-10-12
## 5 2.0 <NA> 2009-05-01
newdata <- subset(leadership, gender=="M" & age > 25, select=gender:item2)
newdata ## gender age item1 item2
## 1 M 32 5 4
## 4 M 39 3 3
1.15 数据抽取sample
#从leadership数据集中随机抽取一个大小为3的样本:
mysample <- leadership[sample(1:nrow(leadership), 3, replace=FALSE),]
#sample 函数, replace=FALSE 表示不放回
mysample## managerID Date country gender age item1 item2 item3 item4 item5 sumx
## 1 1 2008-10-24 US M 32 5 4 5 5 5 9
## 4 4 2008-10-12 UK M 39 3 3 4 NA NA 6
## 3 3 2008-10-01 UK F 25 3 5 5 5 2 8
## meanx agecat date
## 1 4.5 Young 2008-10-24
## 4 3.0 Young 2008-10-12
## 3 4.0 Young 2008-10-01
1.16 使用SQL 语句操作数据框 (可选内容)
# install.packages("sqldf")
library(sqldf)## 载入需要的程辑包:gsubfn
## 载入需要的程辑包:proto
## 载入需要的程辑包:RSQLite
mtcars## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
newdf <- sqldf("select * from mtcars where carb=1 order by mpg", row.names=TRUE)
newdf## mpg cyl disp hp drat wt qsec vs am gear carb
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
sqldf("select avg(mpg) as avg_mpg, avg(disp) as avg_disp, gear
from mtcars where cyl in (4, 6) group by gear")## avg_mpg avg_disp gear
## 1 20.33333 201.0333 3
## 2 24.53333 123.0167 4
## 3 25.36667 120.1333 5
1.17 初级数据管理—Summary
基础知识: R存储缺失值和日期值的方式,处理方法; 如何确定一个对象的数据类型,如何将它转换为其他类型; 使用简单的公式创建了新变量并重编码了现有变量; 展示了如何对数据进行排序和对变量进行重命名; 如何对数据集进行横向合并(添加变量)和纵向合并(添加观测); 讨论了如何保留或丢弃变量,以及如何基于一系列的准则选取观测。
1.17.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.1
1.17.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.2
2 高级数据管理
2.1 数学和统计函数
x = c( 0 : 360 )
y = sin( x/180 * pi )
plot(x, y, type="p", cex = 0.5, las=1, col="blue",
cex.main=2.0, cex.axis = 1.0, cex.lab = 2.0,
main="Sine Function", xlab="Degree", ylab="y = sin(x)" )
abline ( h = 0.0, lty = 2 )# 统计函数
xm <- mean(x)
ym <- mean(y, trim=0.05,
na.rm = TRUE)
# 截尾平均数,即丢弃了最大5%和最小5%的数据和所有缺失值后的算术平均数
# 复杂语句:
x <- c(1:8)
n <- length(x)
meanx <- sum(x)/n
css <- sum(( x - meanx)^2)
sdx <- sqrt(css/(n-1))
# 简洁语句:
mean(x)## [1] 4.5
sd(x)## [1] 2.44949
# 数据的标准化
# 在面试录取研究生的时候,有20位老师给100位学生打分,为了加快面试速度,可能又随机分了组,比如分两组,各10位老师给50位学生打分;
# 这里边如果直接平均老师的分数,存在一些非常不合理的问题,每个老师打分的最小、最大、标准差、均值都不一样,所以要对数据作标准化(Standardizing data):
score1 <- c(92, 95, 95, 89, 79, 60, 78, 50, 30, 80, 76, 87)
score2 <- c(95, 95, 89, 79, 60, 78, 50, 30, 80, 76, 87, 67)
score3 <- c(95, 89, 79, 60, 78, 50, 30, 80, 76, 87, 57, 48)
score4 <- c(89, 79, 60, 78, 50, 30, 80, 76, 87, 49, 87, 67)
score5 <- c(99, 98, 96, 99, 89, 90, 88, 90, 80, 90, 96, 97)
mydata <- data.frame(score1, score2, score3, score4, score5)
mydata## score1 score2 score3 score4 score5
## 1 92 95 95 89 99
## 2 95 95 89 79 98
## 3 95 89 79 60 96
## 4 89 79 60 78 99
## 5 79 60 78 50 89
## 6 60 78 50 30 90
## 7 78 50 30 80 88
## 8 50 30 80 76 90
## 9 30 80 76 87 80
## 10 80 76 87 49 90
## 11 76 87 57 87 96
## 12 87 67 48 67 97
newdata <- scale(mydata) #均值为0, 标准差为1
newdata## score1 score2 score3 score4 score5
## [1,] 0.809248084 1.0945466 1.3109797 1.0630748 1.0969655
## [2,] 0.960195913 1.0945466 1.0074732 0.5225283 0.9237604
## [3,] 0.960195913 0.7842814 0.5016289 -0.5045101 0.5773503
## [4,] 0.658300255 0.2671728 -0.4594752 0.4684737 1.0969655
## [5,] 0.155140824 -0.7153336 0.4510445 -1.0450566 -0.6350853
## [6,] -0.800862094 0.2154619 -0.9653195 -2.1261497 -0.4618802
## [7,] 0.104824881 -1.2324423 -1.9770080 0.5765830 -0.8082904
## [8,] -1.304021524 -2.2666595 0.5522133 0.3603643 -0.4618802
## [9,] -2.310340385 0.3188837 0.3498756 0.9549655 -2.1939310
## [10,] 0.205456767 0.1120402 0.9063043 -1.0991113 -0.4618802
## [11,] 0.004192995 0.6808597 -0.6112285 0.9549655 0.5773503
## [12,] 0.557668369 -0.3533576 -1.0664883 -0.1261275 0.7505553
## attr(,"scaled:center")
## score1 score2 score3 score4 score5
## 75.91667 73.83333 69.08333 69.33333 92.66667
## attr(,"scaled:scale")
## score1 score2 score3 score4 score5
## 19.874416 19.338296 19.768930 18.499795 5.773503
attr(newdata,"scaled:scale")## score1 score2 score3 score4 score5
## 19.874416 19.338296 19.768930 18.499795 5.773503
attr(newdata,"scaled:center")## score1 score2 score3 score4 score5
## 75.91667 73.83333 69.08333 69.33333 92.66667
newdata <- scale(mydata)*10 + 50 #标准差10,均值50
newdata## score1 score2 score3 score4 score5
## [1,] 58.09248 60.94547 63.10980 60.63075 60.96966
## [2,] 59.60196 60.94547 60.07473 55.22528 59.23760
## [3,] 59.60196 57.84281 55.01629 44.95490 55.77350
## [4,] 56.58300 52.67173 45.40525 54.68474 60.96966
## [5,] 51.55141 42.84666 54.51044 39.54943 43.64915
## [6,] 41.99138 52.15462 40.34681 28.73850 45.38120
## [7,] 51.04825 37.67558 30.22992 55.76583 41.91710
## [8,] 36.95978 27.33340 55.52213 53.60364 45.38120
## [9,] 26.89660 53.18884 53.49876 59.54966 28.06069
## [10,] 52.05457 51.12040 59.06304 39.00889 45.38120
## [11,] 50.04193 56.80860 43.88772 59.54966 55.77350
## [12,] 55.57668 46.46642 39.33512 48.73872 57.50555
## attr(,"scaled:center")
## score1 score2 score3 score4 score5
## 75.91667 73.83333 69.08333 69.33333 92.66667
## attr(,"scaled:scale")
## score1 score2 score3 score4 score5
## 19.874416 19.338296 19.768930 18.499795 5.773503
res1 <- rowMeans(mydata)
res1## [1] 94.0 91.2 83.8 81.0 71.2 61.6 65.2 65.2 70.6 76.4 80.6 73.2
myres1 <- data.frame(mydata, res1)
myres1[ order(myres1$res1), ]## score1 score2 score3 score4 score5 res1
## 6 60 78 50 30 90 61.6
## 7 78 50 30 80 88 65.2
## 8 50 30 80 76 90 65.2
## 9 30 80 76 87 80 70.6
## 5 79 60 78 50 89 71.2
## 12 87 67 48 67 97 73.2
## 10 80 76 87 49 90 76.4
## 11 76 87 57 87 96 80.6
## 4 89 79 60 78 99 81.0
## 3 95 89 79 60 96 83.8
## 2 95 95 89 79 98 91.2
## 1 92 95 95 89 99 94.0
myres1## score1 score2 score3 score4 score5 res1
## 1 92 95 95 89 99 94.0
## 2 95 95 89 79 98 91.2
## 3 95 89 79 60 96 83.8
## 4 89 79 60 78 99 81.0
## 5 79 60 78 50 89 71.2
## 6 60 78 50 30 90 61.6
## 7 78 50 30 80 88 65.2
## 8 50 30 80 76 90 65.2
## 9 30 80 76 87 80 70.6
## 10 80 76 87 49 90 76.4
## 11 76 87 57 87 96 80.6
## 12 87 67 48 67 97 73.2
res2 <- rowMeans(newdata) #行平均
myres2 <- data.frame(mydata, res2)
myres2[ order(myres2$res2), ]## score1 score2 score3 score4 score5 res2
## 6 60 78 50 30 90 41.72250
## 7 78 50 30 80 88 43.32733
## 8 50 30 80 76 90 43.76003
## 9 30 80 76 87 80 44.23891
## 5 79 60 78 50 89 46.42142
## 10 80 76 87 49 90 49.32562
## 12 87 67 48 67 97 49.52450
## 11 76 87 57 87 96 53.21228
## 4 89 79 60 78 99 54.06287
## 3 95 89 79 60 96 54.63789
## 2 95 95 89 79 98 59.01701
## 1 92 95 95 89 99 60.74963
#对数据中某一变量标准化:
newdata <- transform(mydata, score5 = scale(score5)*10+50)
newdata## score1 score2 score3 score4 score5
## 1 92 95 95 89 60.96966
## 2 95 95 89 79 59.23760
## 3 95 89 79 60 55.77350
## 4 89 79 60 78 60.96966
## 5 79 60 78 50 43.64915
## 6 60 78 50 30 45.38120
## 7 78 50 30 80 41.91710
## 8 50 30 80 76 45.38120
## 9 30 80 76 87 28.06069
## 10 80 76 87 49 45.38120
## 11 76 87 57 87 55.77350
## 12 87 67 48 67 57.50555
2.2 概率密度函数(简称密度函数)
描述随机变量在某个确定的取值点附近的可能性的函数。
x <- pretty(33.5:39.5, 600) #创建美观的分割点,将连续变量x分割为n个区间
ynorm <- dnorm(x, mean=36.5, sd = 0.8)
y1.5 <- dnorm(x, mean=36.5, sd = 1.5)
plot(ynorm ~ x, pch=1, col="DarkTurquoise", ylim=c(0,0.6), xlab="Temperature of the body", ylab="Density",main="Density Functions", xaxs="i", yaxs="i")
lines(x, y1.5, col="blue", lty=1, lwd=3)x <- pretty(-3:3, 30)
ynorm <- dnorm(x)
yt <- dt(x, df=1)
plot(ynorm ~ x, pch=15, col="DarkTurquoise", ylim=c(0,0.4), xlab="Vector of Quantiles", ylab="Density",main="Density Distribution Functions")plot(ynorm ~ x, pch=15, col="DarkTurquoise", ylim=c(0,0.4), xlab="Vector of Quantiles", ylab="Density",main="Density Distribution Functions", xaxs="i", yaxs="i")
lines(x, ynorm, col="DarkTurquoise", lty=1)
points(x, yt, pch=16,col="DeepPink",cex=1)
lines(x, yt, col="DeepPink", lty=2, lwd=2)
legend(1.8,0.39,c("Normal","Student T"), col=c("DarkTurquoise","DeepPink"), text.col=c("DarkTurquoise","DeepPink"), pch=c(15,16), lty=c(1,2))pnorm(1.96) #0.975 位于x=1.96左侧曲线下方的面积## [1] 0.9750021
pt(1.96, 1) #0.850## [1] 0.8498286
x <- pretty(33.5:39.5, 600)
ynorm <- dnorm(x, mean=36.5, sd = 0.8)
plot(ynorm~x, pch=1, col="DarkTurquoise", ylim=c(0,1.0),
xlab="Temperature of the body", ylab="Density",main="probability Density Functions (pdf) & Cumulative Distribution Function (CDF)",
xaxs="i", yaxs="i")
yp <- pnorm(x, mean=36.5, sd = 0.8)
lines(x, yp, col="blue", lwd=3)qnorm(0.95, mean=36.5, sd=0.8)## [1] 37.81588
qnorm(0.75, mean=36.5, sd=0.8)## [1] 37.03959
qnorm(0.50, mean=36.5, sd=0.8)## [1] 36.5
qnorm(0.25, mean=36.5, sd=0.8)## [1] 35.96041
2.3 随机数
#生成50个均值为50,标准差为10的正态随机数
rnorm(100, mean=50, sd=10)## [1] 53.00942 38.16144 46.90055 28.15861 66.41598 48.23867 60.11290 43.71823
## [9] 72.75052 63.67231 70.69437 57.63203 51.10008 64.82505 39.86845 55.29645
## [17] 38.80570 46.36775 49.19441 38.64582 50.87240 51.36359 54.20151 60.72772
## [25] 80.86780 56.62534 41.91201 52.79017 46.13687 49.17425 49.58915 60.63855
## [33] 38.67791 47.77770 37.45722 38.53883 51.44932 47.19463 64.08087 45.62670
## [41] 66.21964 52.01476 41.36608 38.85086 39.92983 46.09607 46.92721 52.76771
## [49] 38.43757 46.49249 48.02060 58.02652 42.27039 35.49569 55.12285 58.04769
## [57] 39.94541 61.87370 56.31730 56.29335 38.58363 58.14960 58.38641 59.75349
## [65] 38.10810 33.42889 31.94336 50.76991 53.90309 47.86091 54.64006 61.66393
## [73] 61.98280 63.75532 68.94953 42.36623 39.37098 34.09456 50.78561 52.80675
## [81] 38.73806 57.76137 54.21911 53.25080 66.69240 60.65281 64.22991 46.10097
## [89] 51.94213 54.19566 61.55201 57.36599 65.12016 54.86770 43.53554 40.89651
## [97] 41.75186 53.46316 24.51178 71.76688
x <- seq(1:100)
y <- rnorm(100, mean=50, sd=10)
plot(x, y, pch=17, col="blue", cex=1,
cex.axis=1.5, xlim=c(0, 100), ylim=c(20,80), main="rnorm(100, mean=50, sd=10)", xaxs="i", yaxs="i")
lines(x, y, lty=2, col="red", lwd=2)
library(Hmisc)## 载入需要的程辑包:lattice
## 载入需要的程辑包:survival
## 载入需要的程辑包:Formula
## 载入需要的程辑包:ggplot2
##
## 载入程辑包:'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:plyr':
##
## is.discrete, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
minor.tick(nx=2, ny=2, tick.ratio = 0.5)2.4 设定随机数种子
在每次生成伪随机数的时候,函数都会使用一个不同的种子,因此也会产生不同的结果。 如果需要结果重现(reproducible),可通过函数set.seed()显式指定这个种子。 函数runif()用来生成0到1区间上服从均匀分布的伪随机数。
runif(5)## [1] 0.9482966 0.1848350 0.2432801 0.6826676 0.4987312
runif(5)## [1] 0.27944116 0.03861113 0.74238413 0.53629883 0.87030823
set.seed(1234)
runif(5)## [1] 0.1137034 0.6222994 0.6092747 0.6233794 0.8609154
set.seed(1234)
runif(5)## [1] 0.1137034 0.6222994 0.6092747 0.6233794 0.8609154
2.5 将函数应用于矩阵和数据框
避免使用循环语句!
#R函数可以应用到一系列的数据对象上,包括标量、向量、矩阵、数组和数据框
a <- 5
sqrt(a)## [1] 2.236068
b <- c(1.24, 5.65, 2.99)
sqrt(b)## [1] 1.113553 2.376973 1.729162
round(b)## [1] 1 6 3
c <- matrix(runif(12), nrow=3)
log(c)## [,1] [,2] [,3] [,4]
## [1,] -0.4458019 -0.4063399 -0.60701566 -1.2299204
## [2,] -4.6569103 -0.6650435 -1.26325023 -0.1775781
## [3,] -1.4586478 -0.3658724 -0.07965651 -1.2509831
mean(c)## [1] 0.493605
mydata <- matrix(rnorm(30), nrow=6)
mydata## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.6224568 -1.68732684 0.37563561 0.011395161 0.54317290
## [2,] -0.7315360 -0.62743621 0.31026217 0.009859946 0.02142719
## [3,] -0.5166697 0.01831663 0.00500695 0.678271423 0.16256579
## [4,] -1.7507334 0.70524346 -0.03763026 1.029563029 1.24175425
## [5,] 0.8801042 -0.64701901 0.72397606 -1.729528504 0.78277748
## [6,] 1.3700104 0.86818087 -0.49673886 -2.204348095 0.04812071
#apply(x, MARGIN, FUN, ...)
#x为数据对象,MARGIN是维度的下标,FUN是由你指定的函数,
#而...则包括了任何想传递给FUN的参数。MARGIN=1表示行,MARGIN=2表示列。
apply(mydata, 1, mean)## [1] -0.275915992 -0.203484580 0.069498214 0.237639408 0.002062036
## [6] -0.082955004
apply(mydata, 2, mean)## [1] -0.2285469 -0.2283402 0.1467519 -0.3674645 0.4666364
apply(mydata, 2, mean, trim=0.2)## [1] -0.2476396 -0.1377238 0.1633186 -0.2575005 0.3841592
#(trim=0.2截去首尾各20%后平均)
#apply用于多维数组及自定义函数
myfun <- function(x){
sum(x^2)
}
x<-array(1:24, c(2,3,4))
x## , , 1
##
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
##
## , , 2
##
## [,1] [,2] [,3]
## [1,] 7 9 11
## [2,] 8 10 12
##
## , , 3
##
## [,1] [,2] [,3]
## [1,] 13 15 17
## [2,] 14 16 18
##
## , , 4
##
## [,1] [,2] [,3]
## [1,] 19 21 23
## [2,] 20 22 24
apply(x, 1, sum)## [1] 144 156
apply(x, 2, sum)## [1] 84 100 116
apply(x, 3, sum)## [1] 21 57 93 129
apply(x, 1, myfun)## [1] 2300 2600
#apply, tapply, lapply, sapply的区别
iris## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 12 4.8 3.4 1.6 0.2 setosa
## 13 4.8 3.0 1.4 0.1 setosa
## 14 4.3 3.0 1.1 0.1 setosa
## 15 5.8 4.0 1.2 0.2 setosa
## 16 5.7 4.4 1.5 0.4 setosa
## 17 5.4 3.9 1.3 0.4 setosa
## 18 5.1 3.5 1.4 0.3 setosa
## 19 5.7 3.8 1.7 0.3 setosa
## 20 5.1 3.8 1.5 0.3 setosa
## 21 5.4 3.4 1.7 0.2 setosa
## 22 5.1 3.7 1.5 0.4 setosa
## 23 4.6 3.6 1.0 0.2 setosa
## 24 5.1 3.3 1.7 0.5 setosa
## 25 4.8 3.4 1.9 0.2 setosa
## 26 5.0 3.0 1.6 0.2 setosa
## 27 5.0 3.4 1.6 0.4 setosa
## 28 5.2 3.5 1.5 0.2 setosa
## 29 5.2 3.4 1.4 0.2 setosa
## 30 4.7 3.2 1.6 0.2 setosa
## 31 4.8 3.1 1.6 0.2 setosa
## 32 5.4 3.4 1.5 0.4 setosa
## 33 5.2 4.1 1.5 0.1 setosa
## 34 5.5 4.2 1.4 0.2 setosa
## 35 4.9 3.1 1.5 0.2 setosa
## 36 5.0 3.2 1.2 0.2 setosa
## 37 5.5 3.5 1.3 0.2 setosa
## 38 4.9 3.6 1.4 0.1 setosa
## 39 4.4 3.0 1.3 0.2 setosa
## 40 5.1 3.4 1.5 0.2 setosa
## 41 5.0 3.5 1.3 0.3 setosa
## 42 4.5 2.3 1.3 0.3 setosa
## 43 4.4 3.2 1.3 0.2 setosa
## 44 5.0 3.5 1.6 0.6 setosa
## 45 5.1 3.8 1.9 0.4 setosa
## 46 4.8 3.0 1.4 0.3 setosa
## 47 5.1 3.8 1.6 0.2 setosa
## 48 4.6 3.2 1.4 0.2 setosa
## 49 5.3 3.7 1.5 0.2 setosa
## 50 5.0 3.3 1.4 0.2 setosa
## 51 7.0 3.2 4.7 1.4 versicolor
## 52 6.4 3.2 4.5 1.5 versicolor
## 53 6.9 3.1 4.9 1.5 versicolor
## 54 5.5 2.3 4.0 1.3 versicolor
## 55 6.5 2.8 4.6 1.5 versicolor
## 56 5.7 2.8 4.5 1.3 versicolor
## 57 6.3 3.3 4.7 1.6 versicolor
## 58 4.9 2.4 3.3 1.0 versicolor
## 59 6.6 2.9 4.6 1.3 versicolor
## 60 5.2 2.7 3.9 1.4 versicolor
## 61 5.0 2.0 3.5 1.0 versicolor
## 62 5.9 3.0 4.2 1.5 versicolor
## 63 6.0 2.2 4.0 1.0 versicolor
## 64 6.1 2.9 4.7 1.4 versicolor
## 65 5.6 2.9 3.6 1.3 versicolor
## 66 6.7 3.1 4.4 1.4 versicolor
## 67 5.6 3.0 4.5 1.5 versicolor
## 68 5.8 2.7 4.1 1.0 versicolor
## 69 6.2 2.2 4.5 1.5 versicolor
## 70 5.6 2.5 3.9 1.1 versicolor
## 71 5.9 3.2 4.8 1.8 versicolor
## 72 6.1 2.8 4.0 1.3 versicolor
## 73 6.3 2.5 4.9 1.5 versicolor
## 74 6.1 2.8 4.7 1.2 versicolor
## 75 6.4 2.9 4.3 1.3 versicolor
## 76 6.6 3.0 4.4 1.4 versicolor
## 77 6.8 2.8 4.8 1.4 versicolor
## 78 6.7 3.0 5.0 1.7 versicolor
## 79 6.0 2.9 4.5 1.5 versicolor
## 80 5.7 2.6 3.5 1.0 versicolor
## 81 5.5 2.4 3.8 1.1 versicolor
## 82 5.5 2.4 3.7 1.0 versicolor
## 83 5.8 2.7 3.9 1.2 versicolor
## 84 6.0 2.7 5.1 1.6 versicolor
## 85 5.4 3.0 4.5 1.5 versicolor
## 86 6.0 3.4 4.5 1.6 versicolor
## 87 6.7 3.1 4.7 1.5 versicolor
## 88 6.3 2.3 4.4 1.3 versicolor
## 89 5.6 3.0 4.1 1.3 versicolor
## 90 5.5 2.5 4.0 1.3 versicolor
## 91 5.5 2.6 4.4 1.2 versicolor
## 92 6.1 3.0 4.6 1.4 versicolor
## 93 5.8 2.6 4.0 1.2 versicolor
## 94 5.0 2.3 3.3 1.0 versicolor
## 95 5.6 2.7 4.2 1.3 versicolor
## 96 5.7 3.0 4.2 1.2 versicolor
## 97 5.7 2.9 4.2 1.3 versicolor
## 98 6.2 2.9 4.3 1.3 versicolor
## 99 5.1 2.5 3.0 1.1 versicolor
## 100 5.7 2.8 4.1 1.3 versicolor
## 101 6.3 3.3 6.0 2.5 virginica
## 102 5.8 2.7 5.1 1.9 virginica
## 103 7.1 3.0 5.9 2.1 virginica
## 104 6.3 2.9 5.6 1.8 virginica
## 105 6.5 3.0 5.8 2.2 virginica
## 106 7.6 3.0 6.6 2.1 virginica
## 107 4.9 2.5 4.5 1.7 virginica
## 108 7.3 2.9 6.3 1.8 virginica
## 109 6.7 2.5 5.8 1.8 virginica
## 110 7.2 3.6 6.1 2.5 virginica
## 111 6.5 3.2 5.1 2.0 virginica
## 112 6.4 2.7 5.3 1.9 virginica
## 113 6.8 3.0 5.5 2.1 virginica
## 114 5.7 2.5 5.0 2.0 virginica
## 115 5.8 2.8 5.1 2.4 virginica
## 116 6.4 3.2 5.3 2.3 virginica
## 117 6.5 3.0 5.5 1.8 virginica
## 118 7.7 3.8 6.7 2.2 virginica
## 119 7.7 2.6 6.9 2.3 virginica
## 120 6.0 2.2 5.0 1.5 virginica
## 121 6.9 3.2 5.7 2.3 virginica
## 122 5.6 2.8 4.9 2.0 virginica
## 123 7.7 2.8 6.7 2.0 virginica
## 124 6.3 2.7 4.9 1.8 virginica
## 125 6.7 3.3 5.7 2.1 virginica
## 126 7.2 3.2 6.0 1.8 virginica
## 127 6.2 2.8 4.8 1.8 virginica
## 128 6.1 3.0 4.9 1.8 virginica
## 129 6.4 2.8 5.6 2.1 virginica
## 130 7.2 3.0 5.8 1.6 virginica
## 131 7.4 2.8 6.1 1.9 virginica
## 132 7.9 3.8 6.4 2.0 virginica
## 133 6.4 2.8 5.6 2.2 virginica
## 134 6.3 2.8 5.1 1.5 virginica
## 135 6.1 2.6 5.6 1.4 virginica
## 136 7.7 3.0 6.1 2.3 virginica
## 137 6.3 3.4 5.6 2.4 virginica
## 138 6.4 3.1 5.5 1.8 virginica
## 139 6.0 3.0 4.8 1.8 virginica
## 140 6.9 3.1 5.4 2.1 virginica
## 141 6.7 3.1 5.6 2.4 virginica
## 142 6.9 3.1 5.1 2.3 virginica
## 143 5.8 2.7 5.1 1.9 virginica
## 144 6.8 3.2 5.9 2.3 virginica
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
a <- apply(iris[, 1:4], 2, mean) #前四列数据,按列,求均值
a## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 5.843333 3.057333 3.758000 1.199333
#tapply函数:将数据按照条件分组,生成类似列联表形式的数据结果
#tapply(X, INDEX, FUN = NULL, ..., default = NA, simplify = TRUE)
#X:数组、矩阵、数据框等分割型数据向量
#INDEX:一个或多个因子的列表,每个因子的长度都与x相同
#FUN: 自定义的调用函数
tapply(leadership$age, leadership$country, mean) # 求在不同country的age的均值## UK US
## NA 38.5
#求在不同country和gender交叉水平下的age的均值, 输出得到矩阵数据
tapply(leadership$age, list(leadership$country, leadership$gender), mean) ## F M
## UK NA 39
## US 45 32
#lapply函数: 对列表、数据框数据集进行循环,输入为列表,返回值为列表lapply(X, FUN, ...)
#X:列表、数据框
#FUN:自定义的调用函数
b <- list(x = 1:10, y = matrix(1:12, 3, 4))
b## $x
## [1] 1 2 3 4 5 6 7 8 9 10
##
## $y
## [,1] [,2] [,3] [,4]
## [1,] 1 4 7 10
## [2,] 2 5 8 11
## [3,] 3 6 9 12
lapply(b, sum) # 求列表中各元素的和## $x
## [1] 55
##
## $y
## [1] 78
#sapply函数: 类似于lapply函数,但输入为列表,返回值为向量
#sapply(X, FUN, ..., )
#X:列表、矩阵、数据框
#FUN:自定义的调用函数
sapply(b, sum) # 求列表中各元素的和## x y
## 55 78
2.6 数据处理示例:成绩评估
options(digits=2)
Student <- c("John Davis", "Angela Williams",
"Bullwinkle Moose", "David Jones",
"Janice Markhammer", "Cheryl Cushing",
"Reuven Ytzrhak", "Greg Knox", "Joel England",
"Mary Rayburn")
Math <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)
Science <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)
English <- c(25, 22, 18, 15, 20, 28, 15, 30, 27, 18)
roster <- data.frame(Student, Math, Science, English,
stringsAsFactors=FALSE)
z <- scale(roster[,2:4])
score <- apply(z, 1, mean)
score## [1] 0.56 0.92 -0.86 -1.16 -0.63 0.35 -1.05 1.34 0.70 -0.18
roster <- cbind(roster, score) #简单将列靠一起
roster## Student Math Science English score
## 1 John Davis 502 95 25 0.56
## 2 Angela Williams 600 99 22 0.92
## 3 Bullwinkle Moose 412 80 18 -0.86
## 4 David Jones 358 82 15 -1.16
## 5 Janice Markhammer 495 75 20 -0.63
## 6 Cheryl Cushing 512 85 28 0.35
## 7 Reuven Ytzrhak 410 80 15 -1.05
## 8 Greg Knox 625 95 30 1.34
## 9 Joel England 573 89 27 0.70
## 10 Mary Rayburn 522 86 18 -0.18
y <- quantile(score, c(.8, .6, .4, .2))
y## 80% 60% 40% 20%
## 0.74 0.44 -0.36 -0.89
roster$grade[score >= y[1]] <- "A"
roster$grade[score < y[1] & score >= y[2]] <- "B"
roster$grade[score < y[2] & score >= y[3]] <- "C"
roster$grade[score < y[3] & score >= y[4]] <- "D"
roster$grade[score < y[4]] <- "F"
roster## Student Math Science English score grade
## 1 John Davis 502 95 25 0.56 B
## 2 Angela Williams 600 99 22 0.92 A
## 3 Bullwinkle Moose 412 80 18 -0.86 D
## 4 David Jones 358 82 15 -1.16 F
## 5 Janice Markhammer 495 75 20 -0.63 D
## 6 Cheryl Cushing 512 85 28 0.35 C
## 7 Reuven Ytzrhak 410 80 15 -1.05 F
## 8 Greg Knox 625 95 30 1.34 A
## 9 Joel England 573 89 27 0.70 B
## 10 Mary Rayburn 522 86 18 -0.18 C
name <- strsplit((roster$Student), " ") #返回列表
name## [[1]]
## [1] "John" "Davis"
##
## [[2]]
## [1] "Angela" "Williams"
##
## [[3]]
## [1] "Bullwinkle" "Moose"
##
## [[4]]
## [1] "David" "Jones"
##
## [[5]]
## [1] "Janice" "Markhammer"
##
## [[6]]
## [1] "Cheryl" "Cushing"
##
## [[7]]
## [1] "Reuven" "Ytzrhak"
##
## [[8]]
## [1] "Greg" "Knox"
##
## [[9]]
## [1] "Joel" "England"
##
## [[10]]
## [1] "Mary" "Rayburn"
lastname <- sapply(name, "[", 2) # "[" 函数的应用
lastname## [1] "Davis" "Williams" "Moose" "Jones" "Markhammer"
## [6] "Cushing" "Ytzrhak" "Knox" "England" "Rayburn"
firstname <- sapply(name, "[", 1)
firstname## [1] "John" "Angela" "Bullwinkle" "David" "Janice"
## [6] "Cheryl" "Reuven" "Greg" "Joel" "Mary"
roster <- cbind(firstname, lastname, roster[,-1])
roster## firstname lastname Math Science English score grade
## 1 John Davis 502 95 25 0.56 B
## 2 Angela Williams 600 99 22 0.92 A
## 3 Bullwinkle Moose 412 80 18 -0.86 D
## 4 David Jones 358 82 15 -1.16 F
## 5 Janice Markhammer 495 75 20 -0.63 D
## 6 Cheryl Cushing 512 85 28 0.35 C
## 7 Reuven Ytzrhak 410 80 15 -1.05 F
## 8 Greg Knox 625 95 30 1.34 A
## 9 Joel England 573 89 27 0.70 B
## 10 Mary Rayburn 522 86 18 -0.18 C
roster <- roster[order(lastname, firstname), ]
roster## firstname lastname Math Science English score grade
## 6 Cheryl Cushing 512 85 28 0.35 C
## 1 John Davis 502 95 25 0.56 B
## 9 Joel England 573 89 27 0.70 B
## 4 David Jones 358 82 15 -1.16 F
## 8 Greg Knox 625 95 30 1.34 A
## 5 Janice Markhammer 495 75 20 -0.63 D
## 3 Bullwinkle Moose 412 80 18 -0.86 D
## 10 Mary Rayburn 522 86 18 -0.18 C
## 2 Angela Williams 600 99 22 0.92 A
## 7 Reuven Ytzrhak 410 80 15 -1.05 F
# "[" 是一个函数
x <- 1:10
x## [1] 1 2 3 4 5 6 7 8 9 10
x[2]## [1] 2
"["(x,2)## [1] 2
2.7 控制流 Control flow
# 重复和循环
#for 结构
#for (var in seq) statement 如:
for (i in 1:10) print("Hello") ## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
#while结构
#while (cond) statement 如:
i <- 10
while( i>0 ) { print("Hello"); i <- i-1 }## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
## [1] "Hello"
#条件执行
#if-else结构
#if (cond) statement
#if (cond) statement1 else statement2
grade <- roster$grade
if (is.character(grade)) grade <- as.factor(grade)
grade## [1] C B B F A D D C A F
## Levels: A B C D F
grade <- roster$grade
if (!is.factor(grade)) grade <- as.factor(grade) else print("Grade already is a factor")
#ifelse结构: if-else结构的紧凑形式
#ifelse (cond, statement1, statement2)
#与 if (cond) statement1 else statement2 功能相同
#举例:
score## [1] 0.56 0.92 -0.86 -1.16 -0.63 0.35 -1.05 1.34 0.70 -0.18
ifelse(score > 0.5, print("Passed"), print("Failed"))## [1] "Passed"
## [1] "Failed"
## [1] "Passed" "Passed" "Failed" "Failed" "Failed" "Failed" "Failed" "Passed"
## [9] "Passed" "Failed"
outcome <- ifelse (score > 0.5, "Passed", "Failed")
outcome## [1] "Passed" "Passed" "Failed" "Failed" "Failed" "Failed" "Failed" "Passed"
## [9] "Passed" "Failed"
#条件执行
#switch结构: 根据表达式的值,选择执行
#switch (expr, list) expr为表达式,其值或为一个整数值或为一个字符串;
#list为一个列表。参数list是有名定义时,则当expr等于元素名时,
# 返回变量名对应的值,否则没有返回值。举例:
feelings <- c("sad", "afraid")
for (i in feelings)
print(
switch(i,
happy = "I am glad you are happy",
afraid = "There is nothing to fear",
sad = "Cheer up",
angry = "Calm down now"
)
)## [1] "Cheer up"
## [1] "There is nothing to fear"
feelings <- c(2, 4, 3, 1)
for (i in feelings)
print(
switch(i,
"I am glad you are happy",
"There is nothing to fear",
"Cheer up",
"Calm down now"
)
)## [1] "There is nothing to fear"
## [1] "Calm down now"
## [1] "Cheer up"
## [1] "I am glad you are happy"
#条件执行常被应用于自定义函数
mystats <- function(x, parametric=TRUE, print=FALSE)
{
if (parametric) { center <- mean(x); spread <- sd(x) }
else { center <- median(x); spread <- mad(x) }
if (print & parametric) { cat("Mean=", center, "\n", "SD=", spread, "\n") }
else if (print & !parametric) { cat("Median=", center, "\n", "MAD=", spread, "\n") }
result <- list(center=center, spread=spread)
return(result) }
# trying it out
set.seed(1234)
x <- rnorm(500)
y <- mystats(x)
y <- mystats(x, parametric=FALSE, print=TRUE)## Median= -0.021
## MAD= 1
# Another switch example
mydate <- function(type="long") {
switch(type,
long = format(Sys.time(), "%A %B %d %Y"),
short = format(Sys.time(), "%m-%d-%y"),
cat(type, "is not a recognized type\n"))
}
mydate("long")## [1] "星期一 二月 27 2023"
mydate("short")## [1] "02-27-23"
mydate()## [1] "星期一 二月 27 2023"
mydate("medium")## medium is not a recognized type
2.8 整合与重构 Aggregation and restructuring
#转置transpose 的实现: t( )
cars <- mtcars[1:5, 1:4]
cars## mpg cyl disp hp
## Mazda RX4 21 6 160 110
## Mazda RX4 Wag 21 6 160 110
## Datsun 710 23 4 108 93
## Hornet 4 Drive 21 6 258 110
## Hornet Sportabout 19 8 360 175
t(cars)## Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive Hornet Sportabout
## mpg 21 21 23 21 19
## cyl 6 6 4 6 8
## disp 160 160 108 258 360
## hp 110 110 93 110 175
#整合数据,非常重要。
#比如我们拥有逐小时的气象观测数据,试图得到每天的、每月的平均数据、全年逐时平均的日变化数据等等。
#在R中使用一个或多个by变量和一个预先定义好的函数来折叠(collapse)数据,达到整合的目的是非常容易的。
#调用格式为:
#aggregate(x, by, FUN)
#其中x是待折叠的数据对象,by是一个变量名组成的列表(依据它们归类形成新的观测量),而FUN则是用来计算描述性统计量的标量函数,它将被用来计算新观测中的值。
#按照具有相同(气缸cyl、档位gear)数值的观测进行归类,
#使用FUN=mean函数对所有变量数据进行平均,形成新的数据框aggdata
mtcars ## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21 6 160 110 3.9 2.6 16 0 1 4 4
## Mazda RX4 Wag 21 6 160 110 3.9 2.9 17 0 1 4 4
## Datsun 710 23 4 108 93 3.8 2.3 19 1 1 4 1
## Hornet 4 Drive 21 6 258 110 3.1 3.2 19 1 0 3 1
## Hornet Sportabout 19 8 360 175 3.1 3.4 17 0 0 3 2
## Valiant 18 6 225 105 2.8 3.5 20 1 0 3 1
## Duster 360 14 8 360 245 3.2 3.6 16 0 0 3 4
## Merc 240D 24 4 147 62 3.7 3.2 20 1 0 4 2
## Merc 230 23 4 141 95 3.9 3.1 23 1 0 4 2
## Merc 280 19 6 168 123 3.9 3.4 18 1 0 4 4
## Merc 280C 18 6 168 123 3.9 3.4 19 1 0 4 4
## Merc 450SE 16 8 276 180 3.1 4.1 17 0 0 3 3
## Merc 450SL 17 8 276 180 3.1 3.7 18 0 0 3 3
## Merc 450SLC 15 8 276 180 3.1 3.8 18 0 0 3 3
## Cadillac Fleetwood 10 8 472 205 2.9 5.2 18 0 0 3 4
## Lincoln Continental 10 8 460 215 3.0 5.4 18 0 0 3 4
## Chrysler Imperial 15 8 440 230 3.2 5.3 17 0 0 3 4
## Fiat 128 32 4 79 66 4.1 2.2 19 1 1 4 1
## Honda Civic 30 4 76 52 4.9 1.6 19 1 1 4 2
## Toyota Corolla 34 4 71 65 4.2 1.8 20 1 1 4 1
## Toyota Corona 22 4 120 97 3.7 2.5 20 1 0 3 1
## Dodge Challenger 16 8 318 150 2.8 3.5 17 0 0 3 2
## AMC Javelin 15 8 304 150 3.1 3.4 17 0 0 3 2
## Camaro Z28 13 8 350 245 3.7 3.8 15 0 0 3 4
## Pontiac Firebird 19 8 400 175 3.1 3.8 17 0 0 3 2
## Fiat X1-9 27 4 79 66 4.1 1.9 19 1 1 4 1
## Porsche 914-2 26 4 120 91 4.4 2.1 17 0 1 5 2
## Lotus Europa 30 4 95 113 3.8 1.5 17 1 1 5 2
## Ford Pantera L 16 8 351 264 4.2 3.2 14 0 1 5 4
## Ferrari Dino 20 6 145 175 3.6 2.8 16 0 1 5 6
## Maserati Bora 15 8 301 335 3.5 3.6 15 0 1 5 8
## Volvo 142E 21 4 121 109 4.1 2.8 19 1 1 4 2
options(digits=3)
attach(mtcars)## The following object is masked from package:ggplot2:
##
## mpg
aggdata <-aggregate(mtcars, by=list(cyl, gear), FUN=mean, na.rm=TRUE)
detach(mtcars)
aggdata## Group.1 Group.2 mpg cyl disp hp drat wt qsec vs am gear carb
## 1 4 3 21.5 4 120 97 3.70 2.46 20.0 1.0 0.00 3 1.00
## 2 6 3 19.8 6 242 108 2.92 3.34 19.8 1.0 0.00 3 1.00
## 3 8 3 15.1 8 358 194 3.12 4.10 17.1 0.0 0.00 3 3.08
## 4 4 4 26.9 4 103 76 4.11 2.38 19.6 1.0 0.75 4 1.50
## 5 6 4 19.8 6 164 116 3.91 3.09 17.7 0.5 0.50 4 4.00
## 6 4 5 28.2 4 108 102 4.10 1.83 16.8 0.5 1.00 5 2.00
## 7 6 5 19.7 6 145 175 3.62 2.77 15.5 0.0 1.00 5 6.00
## 8 8 5 15.4 8 326 300 3.88 3.37 14.6 0.0 1.00 5 6.00
#重构和整合数据集工具包——reshape2
#install.packages("reshape2")
library(reshape2)
mydatatxt <- "
ID Time X1 X2
1 1 5 6
1 2 3 5
2 1 6 1
2 2 2 4"
mydata <- read.table(header=TRUE, text = mydatatxt)
mydata## ID Time X1 X2
## 1 1 1 5 6
## 2 1 2 3 5
## 3 2 1 6 1
## 4 2 2 2 4
#数据融合(melting)
md <- melt(mydata, id=c("ID", "Time"))
md## ID Time variable value
## 1 1 1 X1 5
## 2 1 2 X1 3
## 3 2 1 X1 6
## 4 2 2 X1 2
## 5 1 1 X2 6
## 6 1 2 X2 5
## 7 2 1 X2 1
## 8 2 2 X2 4
#重铸 casting
#newdata <- cast(md, formula, FUN)
#md为已融合的数据,formula描述了想要的最后结果,而FUN是(可选的)数据整合函数
dcast(md, ID ~ variable, mean)## ID X1 X2
## 1 1 4 5.5
## 2 2 4 2.5
dcast(md, Time ~ variable, mean)## Time X1 X2
## 1 1 5.5 3.5
## 2 2 2.5 4.5
dcast(md, ID ~ Time, mean)## ID 1 2
## 1 1 5.5 4
## 2 2 3.5 3
dcast(md, ID+Time ~ variable)## ID Time X1 X2
## 1 1 1 5 6
## 2 1 2 3 5
## 3 2 1 6 1
## 4 2 2 2 4
dcast(md, ID + variable ~ Time)## ID variable 1 2
## 1 1 X1 5 3
## 2 1 X2 6 5
## 3 2 X1 6 2
## 4 2 X2 1 4
dcast(md, ID ~ variable + Time)## ID X1_1 X1_2 X2_1 X2_2
## 1 1 5 3 6 5
## 2 2 6 2 1 4
2.8.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.1
The End of Chapter Two #########################
3 作业
本次作业需要读入温度、相对湿度、PM2.5质量浓度数据(两个站点),一共4个文件,进行相应数据处理和绘图分析。 同学们可参考以下代码,有基础的同学也可尝试我们没讲到的内容,如ggplot包的绘图。
3.1 数据读取与处理
读取两站PM2.5数据,读取温度、湿度
read_data <- function(file_name, path='./data/', skip=3, header=TRUE, sep=',') {
return (read.table(paste(path, file_name, sep=''), skip=skip, header=header, sep=sep))
}networkpath <- 'http://162.105.145.16/rs/R_data_samples/'
#美使馆PM2.5观测数据:
aq_embassy <- read_data('AQ_FSPMC_USEmbassy.csv', networkpath, skip=3, header=TRUE)
#重命名aq_embassy数据的各列
colnames(aq_embassy) <- c('Time', 'Height', 'PMe', 'Source', 'Status', 'Method')
#时间从字符串转为POSIXlt格式,以便后续数据处理
aq_embassy$Time <- as.POSIXlt(aq_embassy$Time)
#万柳观测站PM2.5数据:
aq_wanliu <- read_data('AQ_FSPMC_WanLiu.csv', networkpath, skip=3, header=TRUE)
colnames(aq_wanliu) <- c('Time', 'Height', 'PMw', 'Source', 'Status', 'Method')
aq_wanliu$Time <- as.POSIXlt(aq_wanliu$Time)
rh <- read_data('A_RH.csv', networkpath, skip=3, header=TRUE)
colnames(rh) <- c('Time', 'Height', 'RH', 'Source', 'Status', 'Method')
rh$Time <- as.POSIXlt(rh$Time)
temp <- read_data('A_TEMP.csv', networkpath, skip=3, header=TRUE)
colnames(temp) <- c('Time', 'Height', 'Temperature', 'Source', 'ErrFlag', 'Details')
temp$Time <- as.POSIXlt(temp$Time)3.1.1 数据抽取
利用 subset 函数抽取2019-2022年、数据源为GTS_METAR的数据
基本格式:subset( origin_data , condition ,select=c(colname1,colname2,…)
)
#temp_sub <- dplyr::filter(temp, Time$year+1900 <= 2022)
temp_sub <- subset(temp, (Source == 'GTS_METAR') & (Time$year+1900 >= 2019) & (Time$year+1900 <= 2022), select=c('Time', 'Temperature'))
rh_sub <- subset(rh, (Source == 'GTS_METAR') & (Time$year+1900 >= 2019) & (Time$year+1900 <= 2022), select=c('Time', 'RH'))
embassy_sub <- subset(aq_embassy, (Time$year+1900 >= 2019) & (Time$year+1900 <= 2022), select=c('Time', 'PMe'))
wanliu_sub <- subset(aq_wanliu, (Time$year+1900 >= 2019) & (Time$year+1900 <= 2022), select=c('Time', 'PMw'))3.1.2 数据重铸得到整小时数据
library(reshape2)
# 先获取每个DataFrame中小时的数据
# two <- merge(embassy_sub, wanliu_sub, by=c('Time'), all=TRUE)
# two1<- merge(embassy_sub, wanliu_sub, by.x = 'Time', by.y = 'Time', all=TRUE)
# two2<- merge(embassy_sub, wanliu_sub, by.embassy_sub = 'Time', by.wanliu_sub = 'Time')
mHourlyData <- data.frame(rh_sub)
#merge(data_A,data_B,by=c(colname1,colname2,...),all=___ )
#all=TRUE 对应:完整合并,将返回所有匹配与未匹配的数据
for (df in list(temp_sub, embassy_sub, wanliu_sub))
{
mHourlyData <- merge(mHourlyData, df, by=c('Time'), all=TRUE)
}
#下一步处理既可以使用aggregate(),也可以调用reshape2包中的melt()与dcast()来完成
#新增一个'Hour'变量,对应以小时为单位计数的时间
mHourlyData$Hour <- format(mHourlyData$Time, '%Y/%m/%d %H:00:00')
#利用'Hour'变量融合,得到中间数据
melted_data <- melt(subset(mHourlyData, select=-c(Time)), id='Hour')
#将中间数据与'Hour'变量关联起来,并且进行'mean'取平均值,na.rm=TRUE在运算中排除了缺失值
mHourlyData_ <- dcast(melted_data, Hour~variable, mean, na.rm=TRUE)
mHourlyData_$Hour <- as.POSIXlt(mHourlyData_$Hour)
#把变量改名, 包名称plyr::很重要!
mHourlyData_ <- plyr::rename(mHourlyData_, c(Hour = "Time"))
mHourlyData <- mHourlyData_
rm(mHourlyData_)整合后的小时级别数据如下
## Time RH Temperature PMe PMw
## 1 2019-01-01 00:00:00 50.0 -9.5 61 NaN
## 2 2019-01-01 01:00:00 54.0 -11.0 57 NaN
## 3 2019-01-01 02:00:00 56.0 -12.0 52 NaN
## 4 2019-01-01 03:00:00 60.7 -13.0 52 NaN
## 5 2019-01-01 04:00:00 58.3 -12.5 46 NaN
## 6 2019-01-01 05:00:00 58.1 -13.5 48 NaN
3.1.3 重铸为每日平均数据
mHourlyData_ <- data.frame(mHourlyData)
mHourlyData_$Day <- substr(mHourlyData_$Time, 1, 10)
melted_data <- melt(subset(mHourlyData_, select=-c(Time)), id='Day')
mDailyData <- dcast(melted_data, Day~variable, mean, na.rm = TRUE )
mDailyData$Day <- as.POSIXlt(mDailyData$Day)
#Another way to replace the above 3 lines:
#mHourlyData_$Date <- as.POSIXlt( mHourlyData$Time, format = "%Y/%m/%d %H:%M:%S")
#mDailyData <- aggregate(mHourlyData_[2:5], by = list(day = mHourlyData_$Day), FUN=mean, na.rm=TRUE)日平均后的数据如下
## Day RH Temperature PMe PMw
## 1 2019-01-01 40.0 -5.90 33.1 16.0
## 2 2019-01-02 40.2 -6.04 54.7 100.4
## 3 2019-01-03 47.1 -5.63 128.2 111.0
## 4 2019-01-04 22.4 -2.04 20.2 21.7
## 5 2019-01-05 22.5 -4.25 13.0 16.9
## 6 2019-01-06 41.8 -7.02 56.5 57.7
在 mDailyData 数据中增加 PM2.5 差值 (大使馆-万柳)
mDailyData$aq_diff <- with (mDailyData, {
PMe - PMw
})根据国家标准在每日数据中,增加 PM2.5 等级因子变量
aq_factor <- function(mpc) {
mpc_ <- replicate(1, mpc)
#创建副本
mpc_[mpc<=35] = 'excellent'
mpc_[mpc>35 & mpc<=75] = 'good'
mpc_[mpc>75 & mpc<=115] = 'lightly polluted'
mpc_[mpc>115 & mpc<=150] = 'moderately polluted'
mpc_[mpc>150 & mpc<=250] = 'heavily polluted'
mpc_[mpc>250] = 'severely polluted'
return (factor(mpc_, ordered=TRUE, levels=c('excellent', 'good', 'lightly polluted', 'moderately polluted', 'heavily polluted', 'severely polluted')))
}
mDailyData$embassy_lvl <- aq_factor(mDailyData$PMe)
mDailyData$wanliu_lvl <- aq_factor(mDailyData$PMw)
#可以将处理后的数据写入文件储存,便于以后直接从日均数据开始数据处理
write.table(mDailyData, "mDailyData.csv", row.names=FALSE, col.names=TRUE, sep="," )3.1.4 计算月平均值
mMonthlyData <- subset(data.frame(mDailyData), select=-c(embassy_lvl, wanliu_lvl))
#mMonthlyData$Year <- as.integer(substr(mMonthlyData$Day, 1, 4))
mMonthlyData$Month <- as.integer(substr(mMonthlyData$Day, 6, 7))
#月份为整数
#melted_data <- melt(subset(mMonthlyData, select=-c(Day)), id=list('Year', 'Month'))
melted_data <- melt(subset(mMonthlyData, select=-c(Day)), id=list('Month'))
#mMonthlyData <- dcast(melted_data, Year + Month ~ variable, mean, na.rm = TRUE)
mMonthlyData <- dcast(melted_data, Month ~ variable, mean, na.rm = TRUE)
#na.rm=TRUE 剔除缺省数据
#writing the data into a file
write.table(mMonthlyData, "mMonthlyData.csv", row.names=FALSE, col.names=TRUE, sep="," )月均数据结果如下
## Month RH Temperature PMe PMw aq_diff
## 1 1 45.7 -2.9222 51.8 49.9 0.450
## 2 2 46.8 0.0412 52.4 52.1 -0.181
## 3 3 44.9 8.3289 52.9 54.5 -1.592
## 4 4 41.2 15.0118 42.5 37.5 5.019
## 5 5 47.6 20.9927 39.6 28.3 9.987
## 6 6 57.3 25.7958 32.8 30.1 4.252
## 7 7 73.4 26.8397 26.6 29.0 -2.517
## 8 8 72.7 25.6338 21.4 22.7 -1.331
## 9 9 69.0 21.7591 30.0 27.4 2.610
## 10 10 61.0 12.4380 38.9 36.9 2.560
## 11 11 57.0 5.5273 44.9 42.7 2.250
## 12 12 46.3 -2.4578 30.6 29.8 0.820
3.1.5 全年平均“日变化”
利用dcast, melt
函数获得变量多年平均的日变化(只有24个小时)数据框
mHourlyData_ <- data.frame(mHourlyData)
mHourlyData_$Hour <- as.integer(substr(mHourlyData_$Time, 12, 13))
melted_data <- melt(subset(mHourlyData_, select=-c(Time)), id='Hour')
#add na.rm=TRUE
mDiurnalData <- dcast(melted_data, Hour~variable, mean, na.rm=TRUE)
mydf <- subset(mHourlyData, Time$hour==3, select=c(Temperature))
mean(mydf$Temperature, na.rm = TRUE)## [1] 8.82
所得日变化数据如下所示
## Hour RH Temperature PMe PMw
## 1 0 66.3 10.29 42.7 38.6
## 2 1 68.1 9.72 42.6 38.2
## 3 2 69.7 9.23 41.5 37.7
## 4 3 70.9 8.82 40.5 37.2
## 5 4 72.1 8.44 39.1 36.6
## 6 5 72.8 8.21 37.9 36.5
3.2 作图部分
颜色自定义: 胭脂红#F03F24 云山蓝#2F90B9 蔚蓝#29B7CB
3.2.1 日均浓度及差值
两观测站日均 PM2.5 浓度图:
opar<-par(pin=c(6,4), mar=c(4,4,3,1))
mDailyData$Day <- as.Date(mDailyData$Day)
x_label<-seq(from=as.Date("2019-01-01"), to=as.Date("2022-12-31"), by=365)
plot(mDailyData$Day, mDailyData$PMe, xaxt="n", yaxt="n", col=rgb(255, 100, 100, maxColorValue=255), type="b", lty =1, cex.main=1.2, pch=20, cex=1.0, cex.lab = 1.0, xlab='Date', ylab = "PM2.5(ug/m3)", ylim=c(0, 300), main="PM2.5 mass concentration in Beijing (2019/01/01 - 2022/12/31)")
lines(mDailyData$Day, mDailyData$PMw, pch=18, cex=0.8, type="b", lty=3, col=rgb(100, 100, 255, maxColorValue=255))
axis(2, at=seq(0, 300, 20), col.axis="black", las=1, cex.axis=0.8, tck=-0.01)
axis(1, x_label, format(x_label,"%Y-%m-%d"), las=1, col.axis="black", cex.axis=0.8, tck=-0.01)
legend("topleft", ncol=1, inset = .02, c("US Embassy", "Wan Liu"), lty=c(1, 3), pch=c(20, 18), col=c(rgb(255, 100, 100, maxColorValue=255), rgb(100, 100, 255, maxColorValue=255)))par(opar)两观测站日均 PM2.5 浓度之差散点图:
opar<-par(pin=c(2.8,2.8))
diff <- na.omit(mDailyData$aq_diff)
rms <- sqrt(mean(diff**2))
tcor <- cor(mDailyData$PMw, mDailyData$PMe, use = "pairwise.complete.obs")
label <- sprintf('R = %.2f\nN = %d\nRMS=%.1f', tcor, length(diff), rms)
plot(mDailyData$PMw, mDailyData$PMe, asp = 1.0, col=rgb(255, 100, 100, maxColorValue=255),
type="b", lty=0, cex.main=1.0, pch=20, cex=1.0, cex.lab = 1.0,
xlab='Wan Liu', ylab = "US Embassy", xlim = c(0, 300), ylim=c(0, 300),
main="PM2.5 mass concentration differences\nbetween two stations", xaxs="i", yaxs="i")
#text(40,270, family="mono", label, cex = 0.8)
legend("topleft", ncol=1,inset=0.03, legend = c("R=0.95", "N=1038", "RMS=11.1"), cex=0.8)
abline(lm(mDailyData$PMw ~ mDailyData$PMe), lty =3, col = "blue", lwd = 2.0)par(opar)分析: 在多数情况,两测量点的污染物浓度差异并不大(差异绝对值在 20 以内), 但也存在个别天差异较大
大使馆和万柳污染物浓度数据呈现高度的相关性(相关系数为 \(R=0.95\) 已经标注在图中)
3.2.2 全年污染等级统计饼图
mDailyData_ <- subset(mDailyData, select=c(wanliu_lvl))
mDailyData_ <- na.omit(mDailyData_ )
#包plyr::指定很重要!
wanliu_cnt <- plyr::count(mDailyData_, 'wanliu_lvl')
mDailyData_ <- subset(mDailyData, select=c(embassy_lvl))
mDailyData_ <- na.omit(mDailyData_ )
embassy_cnt <- plyr::count(mDailyData_, 'embassy_lvl')
wanliu_cnt$pct <- with(wanliu_cnt, 100*freq/sum(freq, na.rm=TRUE))
embassy_cnt$pct <- with(embassy_cnt, 100*freq/sum(freq, na.rm=TRUE))
aq_colors <- c('#00E400', '#FFFF00', '#FF7E00', '#FF0000', '#99004C', '#7E0023')
opar <- par(no.readonly=FALSE)
layout(matrix(c(4, 4, 1, 2, 3, 3), 3, 2, byrow=TRUE), widths=c(1, 1), heights=c(0.2, 1, 0.1))
par(mar=c(0, 0.5, 0, 0), cex.sub=1.4)
pie(wanliu_cnt$freq, sprintf('%.1f%%', wanliu_cnt$pct), col=aq_colors, clockwise=FALSE, radius=0.75)
title(sub='Wanliu', line=-2)
par(mar=c(0, 0, 0, 0.5), cex.sub=1.4)
pie(embassy_cnt$freq, sprintf('%.1f%%', embassy_cnt$pct), col=aq_colors, clockwise=FALSE, radius=0.75)
title(sub='US Embassy', line=-2)
plot.new()
par(mar=c(0, 0, 0, 0), cex=0.8)
legend(x='center', legend=levels(wanliu_cnt$wanliu_lvl), cex=0.7, fill=aq_colors, ncol=6)
plot.new()
par(mar=c(0, 0, 0, 0), cex.main=2)
title(main='Air Quality in Beijing',line=-4)分析: 超过一半天数为优,约 1/3 天数为良,只有少于 1/6 天数存在轻度及以上的污染
3.2.3 月平均柱状图
mMonthlyData_ <- subset(mMonthlyData, select=c(Month, PMe, PMw))
opar <- par(no.readonly=TRUE)
par (mar=c(5,8,4,2)) #mar=c(bottom, left, top, right)
barplot(data = mMonthlyData_, cbind(PMw, PMe) ~ Month, beside = TRUE, col=c("green","yellow"),
names.arg= as.character(mMonthlyData_$Month), main="Monthly PM2.5 mass concentration in Beijing",
xlab="Date", ylab = "PM2.5", border = "red")par(opar)分析: 高污染主要集中在冬季, 而夏季空气质量相对较好
3.2.4 多年平均温度日变化图
opar<-par(pin=c(6,4), mar=c(4,4,3,1))
plot(mDiurnalData$Hour, mDiurnalData$Temperature, xaxt="n", yaxt="n", col=rgb(255, 100, 100, maxColorValue=255), type="b", lty =1, cex.main=1.2, pch=20, cex=1.0, cex.lab = 1.0, xlab='Time', ylab = "Temperature", xlim = c(0, 24), ylim=c(5, 20), main="Diurnal Change of Temperature in Beijing")
axis(2, at=seq(5, 20, 1), col.axis="black", las=1, cex.axis=0.8, tck=-0.01)
axis(1, at=seq(0, 24, 1), las=1, col.axis="black", cex.axis=0.8, tck=-0.01)par(opar)3.2.5 多年平均相对湿度日变化图
opar<-par(pin=c(6,4), mar=c(4,4,3,1))
plot(mDiurnalData$Hour, mDiurnalData$RH, xaxt="n", yaxt="n", col="#2233FF", type="b", lty =1, cex.main=1.2, pch=20, cex=1.0, cex.lab = 1.0, xlab='Time', ylab = "Relative HUmidity", xlim = c(0, 23), ylim=c(30, 80), main="Diurnal Change of Relative HUmidity in Beijing")
axis(2, at=seq(30, 80, 5), col.axis="black", las=1, cex.axis=0.8, tck=-0.01)
axis(1, at=seq(0, 24, 1), las=1, col.axis="black", cex.axis=0.8, tck=-0.01)par(opar)分析: 日均相对湿度最大值出现在 5 时(与气温最低时间较为接近), 最低值出现在 15 时(与最高气温时间较为接近)
3.2.6 多年平均温度、相对湿度日变化叠加图
opar<-par(pin=c(6,4), mar=c(5,5,3,5))
cut <- 5.0
plot(mDiurnalData$Hour, mDiurnalData$Temperature, xaxt="n", yaxt="n", col=rgb(255, 100, 100, maxColorValue=255), type="b", lty =1, cex.main=1.2, pch=20, cex=1.0, cex.lab = 1.0, xlab='Time', ylab = "Temperature", xlim = c(0, 23), ylim=c(0, 20), main="Diurnal Change of Temperature & Relative Humidity in Beijing")
lines(mDiurnalData$Hour, mDiurnalData$RH/cut, xaxt="n", yaxt="n", col="#2233FF", type="b", lty =2, pch=21)
axis(1, at=seq(0, 24, 1), las=1, col.axis="black", cex.axis=0.8, tck=-0.01)
axis(2, at=seq(0, 20, 1), col.axis="black", las=2, cex.axis=0.8, tck=-0.01)
mylabels <- as.character(seq(0, 20, 2)*cut)
axis(side=4, at=seq(0, 20, 2), labels = mylabels, col.axis="black", cex.axis=0.8, tck=-0.01)
mtext("Relative Humidity", side = 4, line = 3)
legend("bottomright", ncol=1, inset = .02, c("Temperature", "Relative Humidity"),
col = c("red", "blue"), lty = c(1, 2), pch = c(20, 21), cex=0.8)par(opar)3.2.7 多年平均 PM2.5 日变化图
opar<-par(pin=c(6,4), mar=c(5,5,3,5))
plot(mDiurnalData$Hour, mDiurnalData$PMe, xaxt="n", yaxt="n", col="#ff8080", type="b", lty =1, cex.main=1.2, pch=20, cex=1.0, cex.lab = 1.0,lwd=2, xlab='Time', ylab = "PM2.5 (ug/m3)", xlim = c(0, 23), ylim=c(34, 46), main="Diurnal Change of PM2.5 mass concentration in Beijing")
lines(mDiurnalData$Hour, mDiurnalData$PMw, xaxt="n", yaxt="n", col="#2233FF", type="b", lty =2, pch=21, lwd = 2)
axis(1, at=seq(0, 24, 1), las=1, col.axis="black", cex.axis=0.8, tck=-0.01)
axis(2, at=seq(34, 46, 2), col.axis="black", las=2, cex.axis=0.8, tck=-0.01)
legend("top", ncol=1, inset = .02, c("US Embassy", "Haidian/Wanliu"),
col = c("red", "blue"), lty = c(1, 2), pch = c(20, 21), lwd=c(2,2), cex=0.8)par(opar)分析: 污染物浓度在 13时 至 17时之间较低, 在 22时 至 (次日)1 时之间达到高峰
The End