R-data-science-4

R基础

#Sys.setlocale('LC_ALL','C') 
## If the console shows a `+` prompt, keep typing to finish the input or press ESC to abort it.
## Tip: wrapping an assignment in parentheses runs the assignment and prints the result in one step.
(y <- seq(1, 10, length.out = 5))# shorthand for assigning y and then printing it

## [1]  1.00  3.25  5.50  7.75 10.00

##在编程时，即便一点细微的差别也会导致程序无法继续运行
##Alt+Shift+K组合键显示快捷键查询表

使用dplyr进行数据转换

使用新的内置数据集 nycflights13

library(nycflights13)
library(tidyverse)

## -- Attaching packages -------------------------------------------------- tidyverse 1.2.1 --

## √ ggplot2 3.1.0     √ purrr   0.2.5
## √ tibble  1.4.2     √ dplyr   0.7.8
## √ tidyr   0.8.2     √ stringr 1.3.1
## √ readr   1.3.1     √ forcats 0.3.0

## -- Conflicts ----------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

dim(nycflights13::flights)

## [1] 336776     19

head(nycflights13::flights)

## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515         2      830
## 2  2013     1     1      533            529         4      850
## 3  2013     1     1      542            540         2      923
## 4  2013     1     1      544            545        -1     1004
## 5  2013     1     1      554            600        -6      812
## 6  2013     1     1      554            558        -4      740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

##flights数据集是一个tibble，tibble也是一种数据框；列名下方的缩写表示各列的数据类型
#int表示整数，dbl表示实数(双精度浮点数)，chr表示字符向量，dttm表示日期时间，lgl表示逻辑变量，fctr(输出中显示为fct)表示因子，date表示日期型

##dplyr基础-5个核心函数
#filter按值筛选观测
#arrange对行重新排序
#select按名称选取变量
#mutate使用现有变量的函数创建新变量
#summarize将多个值总结为一个摘要统计量
#以上函数都可与group_by函数联用，可以将对整个数据集的操作变为在每个分组上分别进行
#工作方式：参数1：数据框，参数2：不带引号的变量，输出结果：数据框

#使用filter进行筛选
filter(flights, month == 1, day == 1)

## # A tibble: 842 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 832 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

#但dplyr不修改输入，需要自己赋值保存变量，要么输出结果
jan1<-filter(flights,month==1,day==1)
#小技巧,同时完成输出结果并保存,用括号包裹
(dec25 <- filter(flights, month == 12, day == 25))

## # A tibble: 719 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    12    25      456            500        -4      649
##  2  2013    12    25      524            515         9      805
##  3  2013    12    25      542            540         2      832
##  4  2013    12    25      546            550        -4     1022
##  5  2013    12    25      556            600        -4      730
##  6  2013    12    25      557            600        -3      743
##  7  2013    12    25      557            600        -3      818
##  8  2013    12    25      559            600        -1      855
##  9  2013    12    25      559            600        -1      849
## 10  2013    12    25      600            600         0      850
## # ... with 709 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

##比较运算符
# > >= < <= != ==
# Computers use finite-precision arithmetic, so exact comparison of doubles is
# unreliable: 1 / 49 * 49 is not exactly 1 and `==` returns FALSE.
# Compare with a tolerance instead, e.g. near(1 / 49 * 49, 1) returns TRUE.
1 / 49 * 49 == 1

## [1] FALSE

##易错点11或12月份的表达，应该是两次month
filter(flights,month==11|month==12)

## # A tibble: 55,403 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    11     1        5           2359         6      352
##  2  2013    11     1       35           2250       105      123
##  3  2013    11     1      455            500        -5      641
##  4  2013    11     1      539            545        -6      856
##  5  2013    11     1      542            545        -3      831
##  6  2013    11     1      549            600       -11      912
##  7  2013    11     1      550            600       -10      705
##  8  2013    11     1      554            600        -6      659
##  9  2013    11     1      554            600        -6      826
## 10  2013    11     1      554            600        -6      749
## # ... with 55,393 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

# Common mistake: `==` binds tighter than `|`, so the condition below is parsed
# as (month == 11) | 12. The non-zero number 12 is coerced to TRUE, so the
# condition is TRUE for every row and all 336,776 flights are returned.
filter(flights,month==11|12)## wrong: does NOT select November or December; write month == 11 | month == 12

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

##有用的小技巧x %in% y 找出x是y中的一个值时的所有行
nov_dec<-filter(flights,month %in% c(11,12))

##缺失值NA(not available)可传染的
#is.na()函数，确定一个值是否为NA
##filter只能筛选出条件为TRUE的行，FALSE与NA排除
(df<-tibble(x=c(1,NA,3)))

## # A tibble: 3 x 1
##       x
##   <dbl>
## 1     1
## 2    NA
## 3     3

filter(df,x>1)# keep the rows where x > 1; the NA row is dropped because its condition is NA, not TRUE

## # A tibble: 1 x 1
##       x
##   <dbl>
## 1     3

filter(df,is.na(x)|x>1)#保留NA

## # A tibble: 2 x 1
##       x
##   <dbl>
## 1    NA
## 2     3

arrange()函数，不是选择行，而是改变行的顺序

接受的参数为数据框，以及一个或多个作为排序依据的列名

arrange(flights, year, month, day)#默认升序，根据年月日排序

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

#desc()设置降序
arrange(flights, desc(dep_delay))#

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     9      641            900      1301     1242
##  2  2013     6    15     1432           1935      1137     1607
##  3  2013     1    10     1121           1635      1126     1239
##  4  2013     9    20     1139           1845      1014     1457
##  5  2013     7    22      845           1600      1005     1044
##  6  2013     4    10     1100           1900       960     1342
##  7  2013     3    17     2321            810       911      135
##  8  2013     6    27      959           1900       899     1236
##  9  2013     7    22     2257            759       898      121
## 10  2013    12     5      756           1700       896     1058
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

#缺失值总是排在最后
df <- tibble(x = c(5, 2, NA))
arrange(df, x)

## # A tibble: 3 x 1
##       x
##   <dbl>
## 1     2
## 2     5
## 3    NA

arrange(df, desc(x))##设置降序仍然缺失值排在最后

## # A tibble: 3 x 1
##       x
##   <dbl>
## 1     5
## 2     2
## 3    NA

##如何将缺失值排在最前面
arrange(df,desc(is.na(x)))#

## # A tibble: 3 x 1
##       x
##   <dbl>
## 1    NA
## 2     5
## 3     2

select()函数选择列，找出自己感兴趣的变量子集

select(flights,year,month,day)#选取年月日三个变量

## # A tibble: 336,776 x 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # ... with 336,766 more rows

#选择year与day之间的所有列
select(flights,year:day)

## # A tibble: 336,776 x 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # ... with 336,766 more rows

#选择不在year与day之间的所有列
select(flights,-(year:day))

## # A tibble: 336,776 x 16
##    dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
##       <int>          <int>     <dbl>    <int>          <int>     <dbl>
##  1      517            515         2      830            819        11
##  2      533            529         4      850            830        20
##  3      542            540         2      923            850        33
##  4      544            545        -1     1004           1022       -18
##  5      554            600        -6      812            837       -25
##  6      554            558        -4      740            728        12
##  7      555            600        -5      913            854        19
##  8      557            600        -3      709            723       -14
##  9      557            600        -3      838            846        -8
## 10      558            600        -2      753            745         8
## # ... with 336,766 more rows, and 10 more variables: carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

##select函数中的一些辅助函数
#starts_with("abc"):匹配以abc开头的名称
#ends_with("xyz"):匹配以xyz结尾的名称
#contains("ijk"):匹配包含ijk
#matches("(.)\\1"):匹配正则表达式(这里表示的是重复字符)
#num_range("x",1:3):匹配x1,x2,和x3

##rename()函数重命名变量
rename(flights,tail_num=tailnum)#重命名变量

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tail_num <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

## Reorder columns: name the columns to move first, then use everything() to keep all remaining columns.
select(flights, time_hour, air_time, everything())# moves time_hour and air_time to the front

## # A tibble: 336,776 x 19
##    time_hour           air_time  year month   day dep_time sched_dep_time
##    <dttm>                 <dbl> <int> <int> <int>    <int>          <int>
##  1 2013-01-01 05:00:00      227  2013     1     1      517            515
##  2 2013-01-01 05:00:00      227  2013     1     1      533            529
##  3 2013-01-01 05:00:00      160  2013     1     1      542            540
##  4 2013-01-01 05:00:00      183  2013     1     1      544            545
##  5 2013-01-01 06:00:00      116  2013     1     1      554            600
##  6 2013-01-01 05:00:00      150  2013     1     1      554            558
##  7 2013-01-01 06:00:00      158  2013     1     1      555            600
##  8 2013-01-01 06:00:00       53  2013     1     1      557            600
##  9 2013-01-01 06:00:00      140  2013     1     1      557            600
## 10 2013-01-01 06:00:00      138  2013     1     1      558            600
## # ... with 336,766 more rows, and 12 more variables: dep_delay <dbl>,
## #   arr_time <int>, sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>,
## #   hour <dbl>, minute <dbl>

##练习
select(flights,air_time,air_time)

## # A tibble: 336,776 x 1
##    air_time
##       <dbl>
##  1      227
##  2      227
##  3      160
##  4      183
##  5      116
##  6      150
##  7      158
##  8       53
##  9      140
## 10      138
## # ... with 336,766 more rows

select(flights,contains("TIME"))# contains() ignores case by default, so "TIME" still matches dep_time, air_time, time_hour, ...

## # A tibble: 336,776 x 6
##    dep_time sched_dep_time arr_time sched_arr_time air_time
##       <int>          <int>    <int>          <int>    <dbl>
##  1      517            515      830            819      227
##  2      533            529      850            830      227
##  3      542            540      923            850      160
##  4      544            545     1004           1022      183
##  5      554            600      812            837      116
##  6      554            558      740            728      150
##  7      555            600      913            854      158
##  8      557            600      709            723       53
##  9      557            600      838            846      140
## 10      558            600      753            745      138
## # ... with 336,766 more rows, and 1 more variable: time_hour <dttm>

mutate()函数添加新变量，即现有数据集以外的变量

## Start from a narrower table: the date columns, both delay columns, distance and air time.
(flights_sml <- flights %>%
  select(year:day, ends_with("delay"), distance, air_time))

## # A tibble: 336,776 x 7
##     year month   day dep_delay arr_delay distance air_time
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl>
##  1  2013     1     1         2        11     1400      227
##  2  2013     1     1         4        20     1416      227
##  3  2013     1     1         2        33     1089      160
##  4  2013     1     1        -1       -18     1576      183
##  5  2013     1     1        -6       -25      762      116
##  6  2013     1     1        -4        12      719      150
##  7  2013     1     1        -5        19     1065      158
##  8  2013     1     1        -3       -14      229       53
##  9  2013     1     1        -3        -8      944      140
## 10  2013     1     1        -2         8      733      138
## # ... with 336,766 more rows

##添加新变量gain,speed-默认添加到数据集的最后
mutate(flights_sml,
  gain = dep_delay - arr_delay,
  speed = distance / air_time * 60
)

## # A tibble: 336,776 x 9
##     year month   day dep_delay arr_delay distance air_time  gain speed
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl> <dbl> <dbl>
##  1  2013     1     1         2        11     1400      227    -9  370.
##  2  2013     1     1         4        20     1416      227   -16  374.
##  3  2013     1     1         2        33     1089      160   -31  408.
##  4  2013     1     1        -1       -18     1576      183    17  517.
##  5  2013     1     1        -6       -25      762      116    19  394.
##  6  2013     1     1        -4        12      719      150   -16  288.
##  7  2013     1     1        -5        19     1065      158   -24  404.
##  8  2013     1     1        -3       -14      229       53    11  259.
##  9  2013     1     1        -3        -8      944      140     5  405.
## 10  2013     1     1        -2         8      733      138   -10  319.
## # ... with 336,766 more rows

##新变量一旦创建就可以使用
mutate(flights_sml,
  gain = dep_delay - arr_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours
)

## # A tibble: 336,776 x 10
##     year month   day dep_delay arr_delay distance air_time  gain hours
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl> <dbl> <dbl>
##  1  2013     1     1         2        11     1400      227    -9 3.78 
##  2  2013     1     1         4        20     1416      227   -16 3.78 
##  3  2013     1     1         2        33     1089      160   -31 2.67 
##  4  2013     1     1        -1       -18     1576      183    17 3.05 
##  5  2013     1     1        -6       -25      762      116    19 1.93 
##  6  2013     1     1        -4        12      719      150   -16 2.5  
##  7  2013     1     1        -5        19     1065      158   -24 2.63 
##  8  2013     1     1        -3       -14      229       53    11 0.883
##  9  2013     1     1        -3        -8      944      140     5 2.33 
## 10  2013     1     1        -2         8      733      138   -10 2.3  
## # ... with 336,766 more rows, and 1 more variable: gain_per_hour <dbl>

##如果只想保留新变量使用transmute()函数
transmute(flights,
  gain = dep_delay - arr_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours
)

## # A tibble: 336,776 x 3
##     gain hours gain_per_hour
##    <dbl> <dbl>         <dbl>
##  1    -9 3.78          -2.38
##  2   -16 3.78          -4.23
##  3   -31 2.67         -11.6 
##  4    17 3.05           5.57
##  5    19 1.93           9.83
##  6   -16 2.5           -6.4 
##  7   -24 2.63          -9.11
##  8    11 0.883         12.5 
##  9     5 2.33           2.14
## 10   -10 2.3           -4.35
## # ... with 336,766 more rows

##模运算符%/%整数除法, %%求余
##lead(),lag()函数
(x <- 1:10)

##  [1]  1  2  3  4  5  6  7  8  9 10

lead(x)

##  [1]  2  3  4  5  6  7  8  9 10 NA

lag(x)

##  [1] NA  1  2  3  4  5  6  7  8  9

x-lag(x)#序列移动差值

##  [1] NA  1  1  1  1  1  1  1  1  1

(x!=lag(x))#序列何时发生变化

##  [1]   NA TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

##R中的累加和,累加积，累加最小值，累加最大值，dplyr提供累加均值
x

##  [1]  1  2  3  4  5  6  7  8  9 10

cumsum(x)#依次累加

##  [1]  1  3  6 10 15 21 28 36 45 55

cummean(x)#依次累加求均值

##  [1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5

##排序
y <- c(1, 2, 2, NA, 3, 4)
min_rank(y)#默认升序

## [1]  1  2  2 NA  4  5

min_rank(desc(y))#desc降序

## [1]  5  3  3 NA  2  1

row_number(y)#返回排序的位置

## [1]  1  2  3 NA  4  5

使用summarize()进行分组摘要-折叠数据框的作用

summarise(flights, delay = mean(dep_delay, na.rm = TRUE))

## # A tibble: 1 x 1
##   delay
##   <dbl>
## 1  12.6

#summarise如果不与group_by联用，作用并不突出；二者联用才能体现分组折叠的强大功能
(by_day <- group_by(flights, year, month, day))#按年月日分组

## # A tibble: 336,776 x 19
## # Groups:   year, month, day [365]
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

(summarise(by_day, delay = mean(dep_delay, na.rm = TRUE)))#折叠效果就此体现

## # A tibble: 365 x 4
## # Groups:   year, month [?]
##     year month   day delay
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ... with 355 more rows

使用管道组合多种操作

##按dest变量分组，不使用管道组合
(by_dest <- group_by(flights, dest))

## # A tibble: 336,776 x 19
## # Groups:   dest [105]
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

(delay <- summarise(by_dest,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
))

## # A tibble: 105 x 4
##    dest  count  dist  delay
##    <chr> <int> <dbl>  <dbl>
##  1 ABQ     254 1826    4.38
##  2 ACK     265  199    4.85
##  3 ALB     439  143   14.4 
##  4 ANC       8 3370   -2.5 
##  5 ATL   17215  757.  11.3 
##  6 AUS    2439 1514.   6.02
##  7 AVL     275  584.   8.00
##  8 BDL     443  116    7.05
##  9 BGR     375  378    8.03
## 10 BHM     297  866.  16.9 
## # ... with 95 more rows

(delay <- filter(delay, count > 20, dest != "HNL"))#对行操作

## # A tibble: 96 x 4
##    dest  count  dist delay
##    <chr> <int> <dbl> <dbl>
##  1 ABQ     254 1826   4.38
##  2 ACK     265  199   4.85
##  3 ALB     439  143  14.4 
##  4 ATL   17215  757. 11.3 
##  5 AUS    2439 1514.  6.02
##  6 AVL     275  584.  8.00
##  7 BDL     443  116   7.05
##  8 BGR     375  378   8.03
##  9 BHM     297  866. 16.9 
## 10 BNA    6333  758. 11.8 
## # ... with 86 more rows

# Scatter plot of mean arrival delay against mean distance per destination;
# point size reflects the number of flights, and a smoothed trend line is overlaid.
ggplot(data = delay, mapping = aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1/3) +
  geom_smooth(se = FALSE)# se = FALSE hides the confidence band around the smoothed line

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

##以上的代码较复杂，因为要对中间的变量命名，影响分析效率

##使用管道组合 %>%
## The same analysis written as a single pipeline. `x %>% f(y)` is evaluated as
## f(x, y): each step receives the previous result as its first argument, so the
## intermediate objects no longer need names. Read %>% as "then"; the focus is
## on the transformations rather than on the objects being transformed.
## (Author's note: think of it as folding the left-hand value into the call on the right.)
delays <- flights %>%
  group_by(dest) %>% # without the pipe this result had to be named (by_dest)
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>% # without the pipe this result had to be named as well
  filter(count > 20, dest != "HNL")
##ggplot2的图层之间用+连接，不能用%>%；但可以先用%>%处理数据，再把结果传给ggplot()

##缺失值的一般原则：输入中有缺失值，输出也会是缺失值；聚合函数提供na.rm参数，可在计算前移除缺失值
flights %>% 
  group_by(year, month, day) %>% 
  summarise(mean = mean(dep_delay))##得到很多NA

## # A tibble: 365 x 4
## # Groups:   year, month [?]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1    NA
##  2  2013     1     2    NA
##  3  2013     1     3    NA
##  4  2013     1     4    NA
##  5  2013     1     5    NA
##  6  2013     1     6    NA
##  7  2013     1     7    NA
##  8  2013     1     8    NA
##  9  2013     1     9    NA
## 10  2013     1    10    NA
## # ... with 355 more rows

##
flights %>% 
  group_by(year, month, day) %>% 
  summarise(mean = mean(dep_delay, na.rm = TRUE))#先移除

## # A tibble: 365 x 4
## # Groups:   year, month [?]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ... with 355 more rows

##
# Keep only flights with both a departure delay and an arrival delay recorded,
# so the summaries below need no na.rm argument.
not_cancelled <- filter(flights, !is.na(dep_delay), !is.na(arr_delay))

# Mean departure delay per day over the retained flights (printed, not stored).
(summarise(
  group_by(not_cancelled, year, month, day),
  mean = mean(dep_delay)
))

## # A tibble: 365 x 4
## # Groups:   year, month [?]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.4 
##  2  2013     1     2 13.7 
##  3  2013     1     3 10.9 
##  4  2013     1     4  8.97
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.56
##  9  2013     1     9  2.30
## 10  2013     1    10  2.84
## # ... with 355 more rows

计数n(), sum(!is.na(x))非缺失值计数

## Mean arrival delay for each tailnum group.
(delays <- summarise(
  group_by(not_cancelled, tailnum),
  delay = mean(arr_delay)
))

## # A tibble: 4,037 x 2
##    tailnum   delay
##    <chr>     <dbl>
##  1 D942DN   31.5  
##  2 N0EGMQ    9.98 
##  3 N10156   12.7  
##  4 N102UW    2.94 
##  5 N103US   -6.93 
##  6 N104UW    1.80 
##  7 N10575   20.7  
##  8 N105UW   -0.267
##  9 N107US   -5.73 
## 10 N108UW   -1.25 
## # ... with 4,027 more rows

##delay映射到x
ggplot(data = delays, mapping = aes(x = delay)) + 
  geom_freqpoly(binwidth = 10)

## Mean arrival delay per tailnum, together with the number of flights (n) in each group.
delays <- summarise(
  group_by(not_cancelled, tailnum),
  delay = mean(arr_delay, na.rm = TRUE),
  n = n()
)
delays

## # A tibble: 4,037 x 3
##    tailnum   delay     n
##    <chr>     <dbl> <int>
##  1 D942DN   31.5       4
##  2 N0EGMQ    9.98    352
##  3 N10156   12.7     145
##  4 N102UW    2.94     48
##  5 N103US   -6.93     46
##  6 N104UW    1.80     46
##  7 N10575   20.7     269
##  8 N105UW   -0.267    45
##  9 N107US   -5.73     41
## 10 N108UW   -1.25     60
## # ... with 4,027 more rows

##将n(航班数量)映射到x轴，delay(平均延误时间)映射到y轴，绘制散点图
ggplot(data = delays, mapping = aes(x = n, y = delay)) + 
  geom_point(alpha = 1/10)

##将ggplot2集成到dplyr工作流的方法要领：数据处理步骤用%>%连接，进入ggplot()之后改用+连接（重要技巧）
delays %>% 
  filter(n > 25) %>% #筛选行
  ggplot(mapping = aes(x = n, y = delay)) +  ##实际上将dplyr工作流内化到data参数
    geom_point(alpha = 1/10)

#### Rstudio技巧：Ctrl+Shift+P组合将上次的代码再次从编辑器发送到Console

##换个数据集
dim(Lahman::Batting)

## [1] 102816     22

head(Lahman::Batting)

##    playerID yearID stint teamID lgID  G  AB  R  H X2B X3B HR RBI SB CS BB
## 1 abercda01   1871     1    TRO   NA  1   4  0  0   0   0  0   0  0  0  0
## 2  addybo01   1871     1    RC1   NA 25 118 30 32   6   0  0  13  8  1  4
## 3 allisar01   1871     1    CL1   NA 29 137 28 40   4   5  0  19  3  1  2
## 4 allisdo01   1871     1    WS3   NA 27 133 28 44  10   2  2  27  1  1  0
## 5 ansonca01   1871     1    RC1   NA 25 120 29 39  11   3  0  16  6  2  2
## 6 armstbo01   1871     1    FW1   NA 12  49  9 11   2   1  0   5  0  1  0
##   SO IBB HBP SH SF GIDP
## 1  0  NA  NA NA NA   NA
## 2  0  NA  NA NA NA   NA
## 3  5  NA  NA NA NA   NA
## 4  2  NA  NA NA NA   NA
## 5  1  NA  NA NA NA   NA
## 6  1  NA  NA NA NA   NA

class(Lahman::Batting)##常规数据框

## [1] "data.frame"

##转换为tibble格式，输出更美观
(batting<-as_tibble(Lahman::Batting))

## # A tibble: 102,816 x 22
##    playerID yearID stint teamID lgID      G    AB     R     H   X2B   X3B
##    <chr>     <int> <int> <fct>  <fct> <int> <int> <int> <int> <int> <int>
##  1 abercda~   1871     1 TRO    NA        1     4     0     0     0     0
##  2 addybo01   1871     1 RC1    NA       25   118    30    32     6     0
##  3 allisar~   1871     1 CL1    NA       29   137    28    40     4     5
##  4 allisdo~   1871     1 WS3    NA       27   133    28    44    10     2
##  5 ansonca~   1871     1 RC1    NA       25   120    29    39    11     3
##  6 armstbo~   1871     1 FW1    NA       12    49     9    11     2     1
##  7 barkeal~   1871     1 RC1    NA        1     4     0     1     0     0
##  8 barnero~   1871     1 BS1    NA       31   157    66    63    10     9
##  9 barrebi~   1871     1 FW1    NA        1     5     1     1     1     0
## 10 barrofr~   1871     1 BS1    NA       18    86    13    13     2     1
## # ... with 102,806 more rows, and 11 more variables: HR <int>, RBI <int>,
## #   SB <int>, CS <int>, BB <int>, SO <int>, IBB <int>, HBP <int>,
## #   SH <int>, SF <int>, GIDP <int>

# Per-player totals: ba is total H divided by total AB (the original notes call
# this "skill"), ab is total AB (described in the original notes as the number of at-bats).
(batters <- summarise(
  group_by(batting, playerID),
  ba = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
  ab = sum(AB, na.rm = TRUE)
))

## # A tibble: 18,915 x 3
##    playerID      ba    ab
##    <chr>      <dbl> <int>
##  1 aardsda01 0          4
##  2 aaronha01 0.305  12364
##  3 aaronto01 0.229    944
##  4 aasedo01  0          5
##  5 abadan01  0.0952    21
##  6 abadfe01  0.111      9
##  7 abadijo01 0.224     49
##  8 abbated01 0.254   3044
##  9 abbeybe01 0.169    225
## 10 abbeych01 0.281   1751
## # ... with 18,905 more rows

##融合ggplot2的工作流
batters %>% 
  filter(ab > 100) %>% 
  ggplot(mapping = aes(x = ab, y = ba)) +
    geom_point() + 
    geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

R-data-science-4

Chard Liu

2019年1月15日

R基础

使用dplyr进行数据转换

使用新的内置数据集 nycflights13

arrange()函数，不是选择行，而是改变行的顺序

接受的参数为数据框，以及一个或多个作为排序依据的列名

select()函数选择列，找出自己感兴趣的变量子集

mutate()函数添加新变量，即现有数据集以外的变量

使用summarize()进行分组摘要-折叠数据框的作用

使用管道组合多种操作

计数n(), sum(!is.na(x))非缺失值计数