# R基础
#Sys.setlocale('LC_ALL','C')
##如遇到+号可继续输入或ESC退出输入
##小技巧,连续执行赋值语句并输出结果到屏幕
(y <- seq(from = 1, to = 10, length.out = 5)) # the outer parentheses print the value that was just assigned
## [1] 1.00 3.25 5.50 7.75 10.00
##在编程时,即便一点细微的差别也会导致程序无法继续运行
##Alt+Shift+K组合键显示快捷键查询表
# 使用dplyr进行数据转换
# 使用新的内置数据集:nycflights13
library(nycflights13)
library(tidyverse)
## -- Attaching packages -------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.1.0 √ purrr 0.2.5
## √ tibble 1.4.2 √ dplyr 0.7.8
## √ tidyr 0.8.2 √ stringr 1.3.1
## √ readr 1.3.1 √ forcats 0.3.0
## -- Conflicts ----------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
dim(nycflights13::flights)
## [1] 336776 19
head(nycflights13::flights)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
##flights数据集是一个tibble,也是一种数据框
#列类型缩写:int表示整数,dbl实数,chr字符向量,dttm日期时间,lgl逻辑变量,fctr因子,date日期型
##dplyr基础-5个核心函数
#filter按值筛选观测
#arrange对行重新排序
#select按名称选取变量
#mutate使用现有变量的函数创建新变量
#summarise将多个值总结为一个摘要统计量
#以上函数都可与group_by函数联用,将对整个数据集的操作变为在每个分组上进行
#工作方式:参数1:数据框,参数2:不带引号的变量,输出结果:数据框
#使用filter进行筛选
flights %>%
  filter(month == 1, day == 1) # keep the rows where month is 1 and day is 1
## # A tibble: 842 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 832 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
#但dplyr不修改输入,需要自己赋值保存变量,要么输出结果
# The filtered rows are only kept if they are assigned to a name.
jan1 <- flights %>% filter(month == 1, day == 1)
# Tip: wrapping an assignment in parentheses stores the result and prints it.
(dec25 <- flights %>% filter(month == 12, day == 25))
## # A tibble: 719 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 12 25 456 500 -4 649
## 2 2013 12 25 524 515 9 805
## 3 2013 12 25 542 540 2 832
## 4 2013 12 25 546 550 -4 1022
## 5 2013 12 25 556 600 -4 730
## 6 2013 12 25 557 600 -3 743
## 7 2013 12 25 557 600 -3 818
## 8 2013 12 25 559 600 -1 855
## 9 2013 12 25 559 600 -1 849
## 10 2013 12 25 600 600 0 850
## # ... with 709 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
##比较运算符
# > >= < <= != ==
# Floating-point equality: compare with a tolerance (near() in dplyr,
# all.equal() in base R) instead of ==.
# Mathematically 1 / 49 * 49 is exactly 1, but finite-precision arithmetic
# leaves a tiny rounding error, so the exact comparison fails.
1 / 49 * 49 == 1
## [1] FALSE
# The tolerance-based check succeeds; near(1 / 49 * 49, 1) gives the same answer.
isTRUE(all.equal(1 / 49 * 49, 1))
## [1] TRUE
##易错点11或12月份的表达,应该是两次month
filter(flights,month==11|month==12)
## # A tibble: 55,403 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 11 1 5 2359 6 352
## 2 2013 11 1 35 2250 105 123
## 3 2013 11 1 455 500 -5 641
## 4 2013 11 1 539 545 -6 856
## 5 2013 11 1 542 545 -3 831
## 6 2013 11 1 549 600 -11 912
## 7 2013 11 1 550 600 -10 705
## 8 2013 11 1 554 600 -6 659
## 9 2013 11 1 554 600 -6 826
## 10 2013 11 1 554 600 -6 749
## # ... with 55,393 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Common mistake: this does NOT select the flights from November or December.
# `==` binds tighter than `|`, so the condition parses as (month == 11) | 12.
# The non-zero number 12 is coerced to TRUE, making the condition TRUE for
# every row, so all 336,776 rows are returned (see the output below).
filter(flights,month==11|12)## wrong: write month == 11 | month == 12, or month %in% c(11, 12)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
##有用的小技巧x %in% y 找出x是y中的一个值时的所有行
nov_dec <- flights %>%
  filter(month %in% c(11, 12)) # rows whose month is one of 11 or 12
##缺失值NA(not available)可传染的
#is.na()函数,确定一个值是否为NA
##filter只能筛选出条件为TRUE的行,FALSE与NA排除
(df<-tibble(x=c(1,NA,3)))
## # A tibble: 3 x 1
## x
## <dbl>
## 1 1
## 2 NA
## 3 3
df %>% filter(x > 1) # keeps only the rows where x > 1; the NA row is dropped too
## # A tibble: 1 x 1
## x
## <dbl>
## 1 3
df %>% filter(is.na(x) | x > 1) # ask for the NA row explicitly to keep it
## # A tibble: 2 x 1
## x
## <dbl>
## 1 NA
## 2 3
# arrange()函数:不是选择行,而是改变行的顺序
# 接受的参数为数据框,以及一个或多个作为排序依据的列名
flights %>% arrange(year, month, day) # ascending by default; sorted by year, month, day
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
#desc()设置降序
flights %>% arrange(desc(dep_delay)) # desc() sorts in descending order: largest dep_delay first
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301 1242
## 2 2013 6 15 1432 1935 1137 1607
## 3 2013 1 10 1121 1635 1126 1239
## 4 2013 9 20 1139 1845 1014 1457
## 5 2013 7 22 845 1600 1005 1044
## 6 2013 4 10 1100 1900 960 1342
## 7 2013 3 17 2321 810 911 135
## 8 2013 6 27 959 1900 899 1236
## 9 2013 7 22 2257 759 898 121
## 10 2013 12 5 756 1700 896 1058
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
#缺失值总是排在最后
# A three-row tibble with one missing value; sorting ascending puts the NA last.
df <- tibble(x = c(5, 2, NA_real_))
df %>% arrange(x)
## # A tibble: 3 x 1
## x
## <dbl>
## 1 2
## 2 5
## 3 NA
df %>% arrange(desc(x)) # descending order still leaves the NA row last
## # A tibble: 3 x 1
## x
## <dbl>
## 1 5
## 2 2
## 3 NA
##如何将缺失值排在最前面
df %>% arrange(desc(is.na(x))) # sort on the is.na() flag, descending, so the NA row comes first
## # A tibble: 3 x 1
## x
## <dbl>
## 1 NA
## 2 5
## 3 2
# select()函数选择列,找出自己感兴趣的变量子集
select(flights,year,month,day)#选取年月日三个变量
## # A tibble: 336,776 x 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # ... with 336,766 more rows
#选择year与day之间的所有列
select(flights,year:day)
## # A tibble: 336,776 x 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # ... with 336,766 more rows
#选择不在year与day之间的所有列
flights %>% select(-(year:day)) # every column except those from year through day
## # A tibble: 336,776 x 16
## dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
## <int> <int> <dbl> <int> <int> <dbl>
## 1 517 515 2 830 819 11
## 2 533 529 4 850 830 20
## 3 542 540 2 923 850 33
## 4 544 545 -1 1004 1022 -18
## 5 554 600 -6 812 837 -25
## 6 554 558 -4 740 728 12
## 7 555 600 -5 913 854 19
## 8 557 600 -3 709 723 -14
## 9 557 600 -3 838 846 -8
## 10 558 600 -2 753 745 8
## # ... with 336,766 more rows, and 10 more variables: carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
##select函数中的一些辅助函数
#starts_with("abc"):匹配以abc开头的名称
#ends_with("xyz"):匹配以xyz结尾的名称
#contains("ijk"):匹配包含ijk
#matches("(.)\\1"):匹配正则表达式(这里表示的是重复字符)
#num_range("x",1:3):匹配x1,x2,和x3
##rename()函数重命名变量
flights %>% rename(tail_num = tailnum) # new name on the left, existing column on the right
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tail_num <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
##移动变量在数据框中的位置,everything函数
# Move time_hour and air_time to the front; everything() keeps the remaining
# columns after them.
flights %>%
  select(time_hour, air_time, everything())
## # A tibble: 336,776 x 19
## time_hour air_time year month day dep_time sched_dep_time
## <dttm> <dbl> <int> <int> <int> <int> <int>
## 1 2013-01-01 05:00:00 227 2013 1 1 517 515
## 2 2013-01-01 05:00:00 227 2013 1 1 533 529
## 3 2013-01-01 05:00:00 160 2013 1 1 542 540
## 4 2013-01-01 05:00:00 183 2013 1 1 544 545
## 5 2013-01-01 06:00:00 116 2013 1 1 554 600
## 6 2013-01-01 05:00:00 150 2013 1 1 554 558
## 7 2013-01-01 06:00:00 158 2013 1 1 555 600
## 8 2013-01-01 06:00:00 53 2013 1 1 557 600
## 9 2013-01-01 06:00:00 140 2013 1 1 557 600
## 10 2013-01-01 06:00:00 138 2013 1 1 558 600
## # ... with 336,766 more rows, and 12 more variables: dep_delay <dbl>,
## # arr_time <int>, sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>,
## # hour <dbl>, minute <dbl>
##练习
select(flights,air_time,air_time)
## # A tibble: 336,776 x 1
## air_time
## <dbl>
## 1 227
## 2 227
## 3 160
## 4 183
## 5 116
## 6 150
## 7 158
## 8 53
## 9 140
## 10 138
## # ... with 336,766 more rows
select(flights,contains("TIME"))#
## # A tibble: 336,776 x 6
## dep_time sched_dep_time arr_time sched_arr_time air_time
## <int> <int> <int> <int> <dbl>
## 1 517 515 830 819 227
## 2 533 529 850 830 227
## 3 542 540 923 850 160
## 4 544 545 1004 1022 183
## 5 554 600 812 837 116
## 6 554 558 740 728 150
## 7 555 600 913 854 158
## 8 557 600 709 723 53
## 9 557 600 838 846 140
## 10 558 600 753 745 138
## # ... with 336,766 more rows, and 1 more variable: time_hour <dttm>
# mutate()函数添加新变量,即现有数据集以外的变量
##先选取一个子集
# Narrow subset used by the mutate() examples: the columns from year through
# day, every column whose name ends in "delay", plus distance and air_time.
(flights_sml <- flights %>%
  select(year:day, ends_with("delay"), distance, air_time))
## # A tibble: 336,776 x 7
## year month day dep_delay arr_delay distance air_time
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227
## 2 2013 1 1 4 20 1416 227
## 3 2013 1 1 2 33 1089 160
## 4 2013 1 1 -1 -18 1576 183
## 5 2013 1 1 -6 -25 762 116
## 6 2013 1 1 -4 12 719 150
## 7 2013 1 1 -5 19 1065 158
## 8 2013 1 1 -3 -14 229 53
## 9 2013 1 1 -3 -8 944 140
## 10 2013 1 1 -2 8 733 138
## # ... with 336,766 more rows
##添加新变量gain,speed-默认添加到数据集的最后
# Append two derived columns, gain and speed, after the existing ones.
flights_sml %>%
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60
  )
## # A tibble: 336,776 x 9
## year month day dep_delay arr_delay distance air_time gain speed
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227 -9 370.
## 2 2013 1 1 4 20 1416 227 -16 374.
## 3 2013 1 1 2 33 1089 160 -31 408.
## 4 2013 1 1 -1 -18 1576 183 17 517.
## 5 2013 1 1 -6 -25 762 116 19 394.
## 6 2013 1 1 -4 12 719 150 -16 288.
## 7 2013 1 1 -5 19 1065 158 -24 404.
## 8 2013 1 1 -3 -14 229 53 11 259.
## 9 2013 1 1 -3 -8 944 140 5 405.
## 10 2013 1 1 -2 8 733 138 -10 319.
## # ... with 336,766 more rows
##新变量一旦创建就可以使用
# gain_per_hour is built from gain and hours, which are created in this same call.
flights_sml %>%
  mutate(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours
  )
## # A tibble: 336,776 x 10
## year month day dep_delay arr_delay distance air_time gain hours
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227 -9 3.78
## 2 2013 1 1 4 20 1416 227 -16 3.78
## 3 2013 1 1 2 33 1089 160 -31 2.67
## 4 2013 1 1 -1 -18 1576 183 17 3.05
## 5 2013 1 1 -6 -25 762 116 19 1.93
## 6 2013 1 1 -4 12 719 150 -16 2.5
## 7 2013 1 1 -5 19 1065 158 -24 2.63
## 8 2013 1 1 -3 -14 229 53 11 0.883
## 9 2013 1 1 -3 -8 944 140 5 2.33
## 10 2013 1 1 -2 8 733 138 -10 2.3
## # ... with 336,766 more rows, and 1 more variable: gain_per_hour <dbl>
##如果只想保留新变量使用transmute()函数
# transmute() returns only the newly created columns.
flights %>%
  transmute(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours
  )
## # A tibble: 336,776 x 3
## gain hours gain_per_hour
## <dbl> <dbl> <dbl>
## 1 -9 3.78 -2.38
## 2 -16 3.78 -4.23
## 3 -31 2.67 -11.6
## 4 17 3.05 5.57
## 5 19 1.93 9.83
## 6 -16 2.5 -6.4
## 7 -24 2.63 -9.11
## 8 11 0.883 12.5
## 9 5 2.33 2.14
## 10 -10 2.3 -4.35
## # ... with 336,766 more rows
##模运算符%/%整数除法, %%求余
##lead(),lag()函数
(x <- 1:10)
## [1] 1 2 3 4 5 6 7 8 9 10
lead(x)
## [1] 2 3 4 5 6 7 8 9 10 NA
lag(x)
## [1] NA 1 2 3 4 5 6 7 8 9
x-lag(x)#序列移动差值
## [1] NA 1 1 1 1 1 1 1 1 1
(x!=lag(x))#序列何时发生变化
## [1] NA TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##R中的累加和,累加积,累加最小值,累加最大值,dplyr提供累加均值
x
## [1] 1 2 3 4 5 6 7 8 9 10
cumsum(x)#依次累加
## [1] 1 3 6 10 15 21 28 36 45 55
cummean(x)#依次累加求均值
## [1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5
##排序
y <- c(1, 2, 2, NA, 3, 4)
min_rank(y)#默认升序
## [1] 1 2 2 NA 4 5
min_rank(desc(y))#desc降序
## [1] 5 3 3 NA 2 1
row_number(y)#返回排序的位置
## [1] 1 2 3 NA 4 5
# 使用summarize()进行分组摘要:作用是折叠数据框
# Collapse the whole table to one row: the mean dep_delay, ignoring NAs.
flights %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 1 x 1
## delay
## <dbl>
## 1 12.6
#如果不与group_by联用,则没有突出效果-分组折叠的强大功能
(by_day <- flights %>% group_by(year, month, day)) # group by year, month and day
## # A tibble: 336,776 x 19
## # Groups: year, month, day [365]
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# On grouped data summarise() yields one row per group (one mean per day here).
(by_day %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE)))
## # A tibble: 365 x 4
## # Groups: year, month [?]
## year month day delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ... with 355 more rows
# 使用管道组合多种操作
##按dest变量分组,不使用管道组合
(by_dest <- flights %>% group_by(dest)) # one group per dest value
## # A tibble: 336,776 x 19
## # Groups: dest [105]
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Per dest group: number of rows, mean distance and mean arr_delay
# (missing values are removed before averaging).
(delay <- by_dest %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ))
## # A tibble: 105 x 4
## dest count dist delay
## <chr> <int> <dbl> <dbl>
## 1 ABQ 254 1826 4.38
## 2 ACK 265 199 4.85
## 3 ALB 439 143 14.4
## 4 ANC 8 3370 -2.5
## 5 ATL 17215 757. 11.3
## 6 AUS 2439 1514. 6.02
## 7 AVL 275 584. 8.00
## 8 BDL 443 116 7.05
## 9 BGR 375 378 8.03
## 10 BHM 297 866. 16.9
## # ... with 95 more rows
# Row filter: keep groups with more than 20 rows and drop the "HNL" destination.
(delay <- delay %>% filter(count > 20, dest != "HNL"))
## # A tibble: 96 x 4
## dest count dist delay
## <chr> <int> <dbl> <dbl>
## 1 ABQ 254 1826 4.38
## 2 ACK 265 199 4.85
## 3 ALB 439 143 14.4
## 4 ATL 17215 757. 11.3
## 5 AUS 2439 1514. 6.02
## 6 AVL 275 584. 8.00
## 7 BDL 443 116 7.05
## 8 BGR 375 378 8.03
## 9 BHM 297 866. 16.9
## 10 BNA 6333 758. 11.8
## # ... with 86 more rows
# Scatter plot of delay against dist, point size mapped to count, with a
# smoothed trend line; se = FALSE turns off the shaded band around it.
ggplot(delay, aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

##以上的代码较复杂,因为要对中间的变量命名,影响分析效率
##使用管道组合 %>%
# Group, summarise and filter in one chain: no intermediate result needs a name.
delays <- flights %>%
  group_by(dest) %>% # without the pipe this grouped table would need its own name
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>% # the summary would also need a name; the pipe passes it straight on
  filter(count > 20, dest != "HNL")
## %>%可读作然后,重点在于转换的过程而不是对象
##自己的理解,是一种将变量融合进当前对象的思想
##如 x %>% f(y),转换为f(x,y)
# Same chain with the `.` placeholder written out: `.` marks where the
# left-hand side of %>% is passed, i.e. x %>% f(., y) is f(x, y).
delays <- flights %>%
  group_by(., dest) %>%
  summarise(
    .,
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(., count > 20, dest != "HNL")
##ggplot2暂不支持管道操作
##缺失值的一般原则:输入有缺失,输出也为缺失;na.rm参数可在计算前移除缺失值
# Without na.rm the daily means come out as NA (see the output below).
flights %>%
  group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay))
## # A tibble: 365 x 4
## # Groups: year, month [?]
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 NA
## 2 2013 1 2 NA
## 3 2013 1 3 NA
## 4 2013 1 4 NA
## 5 2013 1 5 NA
## 6 2013 1 6 NA
## 7 2013 1 7 NA
## 8 2013 1 8 NA
## 9 2013 1 9 NA
## 10 2013 1 10 NA
## # ... with 355 more rows
##
# na.rm = TRUE removes the missing dep_delay values before averaging.
flights %>%
  group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay, na.rm = TRUE))
## # A tibble: 365 x 4
## # Groups: year, month [?]
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ... with 355 more rows
##
# Keep only the rows where neither dep_delay nor arr_delay is missing,
# then average dep_delay per day without needing na.rm.
not_cancelled <- flights %>%
  filter(!(is.na(dep_delay) | is.na(arr_delay)))
(not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay)))
## # A tibble: 365 x 4
## # Groups: year, month [?]
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.4
## 2 2013 1 2 13.7
## 3 2013 1 3 10.9
## 4 2013 1 4 8.97
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.56
## 9 2013 1 9 2.30
## 10 2013 1 10 2.84
## # ... with 355 more rows
# 计数:n()统计行数,sum(!is.na(x))统计非缺失值的个数
##根据tailnum分组,计算延误时间均值
# Mean arr_delay for each tailnum value.
(delays <- not_cancelled %>%
  group_by(tailnum) %>%
  summarise(delay = mean(arr_delay)))
## # A tibble: 4,037 x 2
## tailnum delay
## <chr> <dbl>
## 1 D942DN 31.5
## 2 N0EGMQ 9.98
## 3 N10156 12.7
## 4 N102UW 2.94
## 5 N103US -6.93
## 6 N104UW 1.80
## 7 N10575 20.7
## 8 N105UW -0.267
## 9 N107US -5.73
## 10 N108UW -1.25
## # ... with 4,027 more rows
##delay映射到x
# Frequency polygon of the per-tailnum mean delay, in bins of width 10.
ggplot(delays, aes(x = delay)) +
  geom_freqpoly(binwidth = 10)

##计数航班数量
# Same per-tailnum summary, now also recording the number of rows in each group.
delays <- not_cancelled %>%
  group_by(tailnum) %>%
  summarise(
    delay = mean(arr_delay, na.rm = TRUE),
    n = n() # row count of the group
  )
delays
## # A tibble: 4,037 x 3
## tailnum delay n
## <chr> <dbl> <int>
## 1 D942DN 31.5 4
## 2 N0EGMQ 9.98 352
## 3 N10156 12.7 145
## 4 N102UW 2.94 48
## 5 N103US -6.93 46
## 6 N104UW 1.80 46
## 7 N10575 20.7 269
## 8 N105UW -0.267 45
## 9 N107US -5.73 41
## 10 N108UW -1.25 60
## # ... with 4,027 more rows
##将航班数量n映射到x轴,平均延误时间delay映射到y轴,绘制散点图
# Scatter plot of mean delay against group size n; alpha makes overlaps visible.
ggplot(delays, aes(x = n, y = delay)) +
  geom_point(alpha = 1 / 10)

##将ggplot2集成到dplyr工作流的方法要领 %>% 过渡到+ (重要技巧)
# The dplyr steps are chained with %>%; once ggplot() is reached, layers are
# added with +. The piped table becomes ggplot()'s data argument.
delays %>%
  filter(n > 25) %>% # row filter: groups with more than 25 rows
  ggplot(aes(x = n, y = delay)) +
  geom_point(alpha = 1 / 10)

#### Rstudio技巧:Ctrl+Shift+P组合将上次的代码再次从编辑器发送到Console
##换个数据集
dim(Lahman::Batting)
## [1] 102816 22
head(Lahman::Batting)
## playerID yearID stint teamID lgID G AB R H X2B X3B HR RBI SB CS BB
## 1 abercda01 1871 1 TRO NA 1 4 0 0 0 0 0 0 0 0 0
## 2 addybo01 1871 1 RC1 NA 25 118 30 32 6 0 0 13 8 1 4
## 3 allisar01 1871 1 CL1 NA 29 137 28 40 4 5 0 19 3 1 2
## 4 allisdo01 1871 1 WS3 NA 27 133 28 44 10 2 2 27 1 1 0
## 5 ansonca01 1871 1 RC1 NA 25 120 29 39 11 3 0 16 6 2 2
## 6 armstbo01 1871 1 FW1 NA 12 49 9 11 2 1 0 5 0 1 0
## SO IBB HBP SH SF GIDP
## 1 0 NA NA NA NA NA
## 2 0 NA NA NA NA NA
## 3 5 NA NA NA NA NA
## 4 2 NA NA NA NA NA
## 5 1 NA NA NA NA NA
## 6 1 NA NA NA NA NA
class(Lahman::Batting)##常规数据框
## [1] "data.frame"
##转换为tibble格式,输出更美观
(batting <- Lahman::Batting %>% as_tibble()) # convert the plain data.frame to a tibble and print it
## # A tibble: 102,816 x 22
## playerID yearID stint teamID lgID G AB R H X2B X3B
## <chr> <int> <int> <fct> <fct> <int> <int> <int> <int> <int> <int>
## 1 abercda~ 1871 1 TRO NA 1 4 0 0 0 0
## 2 addybo01 1871 1 RC1 NA 25 118 30 32 6 0
## 3 allisar~ 1871 1 CL1 NA 29 137 28 40 4 5
## 4 allisdo~ 1871 1 WS3 NA 27 133 28 44 10 2
## 5 ansonca~ 1871 1 RC1 NA 25 120 29 39 11 3
## 6 armstbo~ 1871 1 FW1 NA 12 49 9 11 2 1
## 7 barkeal~ 1871 1 RC1 NA 1 4 0 1 0 0
## 8 barnero~ 1871 1 BS1 NA 31 157 66 63 10 9
## 9 barrebi~ 1871 1 FW1 NA 1 5 1 1 1 0
## 10 barrofr~ 1871 1 BS1 NA 18 86 13 13 2 1
## # ... with 102,806 more rows, and 11 more variables: HR <int>, RBI <int>,
## # SB <int>, CS <int>, BB <int>, SO <int>, IBB <int>, HBP <int>,
## # SH <int>, SF <int>, GIDP <int>
# One row per playerID.
(batters <- batting %>%
  group_by(playerID) %>%
  summarise(
    # summed H divided by summed AB (the original note labels this "ability")
    ba = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
    # summed AB (the original note labels this the number of at-bats)
    ab = sum(AB, na.rm = TRUE)
  ))
## # A tibble: 18,915 x 3
## playerID ba ab
## <chr> <dbl> <int>
## 1 aardsda01 0 4
## 2 aaronha01 0.305 12364
## 3 aaronto01 0.229 944
## 4 aasedo01 0 5
## 5 abadan01 0.0952 21
## 6 abadfe01 0.111 9
## 7 abadijo01 0.224 49
## 8 abbated01 0.254 3044
## 9 abbeybe01 0.169 225
## 10 abbeych01 0.281 1751
## # ... with 18,905 more rows
##融合ggplot2的工作流
# Keep the rows with ab above 100, then plot ba against ab with a smoothed
# trend line (se = FALSE: no shaded band).
batters %>%
  filter(ab > 100) %>%
  ggplot(aes(x = ab, y = ba)) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
