tidyfst是一个以data.table为后端的 tidy 数据操作动词工具包。
github :https://github.com/hope-data-science/tidyfst cheatsheet:https://github.com/hope-data-science/tidyfst/blob/master/docs/tidyfst_cheatsheet.pdf
如果有需要,可以直接使用datatable
#basic usage
library(tidyfst)
##
## Life's short, use R.
library(nycflights13)
library(data.table)
data.table(flights)
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <num> <int>
## 1: 2013 1 1 517 515 2 830
## 2: 2013 1 1 533 529 4 850
## 3: 2013 1 1 542 540 2 923
## 4: 2013 1 1 544 545 -1 1004
## 5: 2013 1 1 554 600 -6 812
## ---
## 336772: 2013 9 30 NA 1455 NA NA
## 336773: 2013 9 30 NA 2200 NA NA
## 336774: 2013 9 30 NA 1210 NA NA
## 336775: 2013 9 30 NA 1159 NA NA
## 336776: 2013 9 30 NA 840 NA NA
## 12 variables not shown: [sched_arr_time <int>, arr_delay <num>, carrier <char>, flight <int>, tailnum <char>, origin <char>, dest <char>, air_time <num>, distance <num>, hour <num>, ...]
filter_dt(flights, month == 1 & day == 1) # filter_dt(flights, month == 1,day == 1) 可能会返回错误
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <num> <int>
## 1: 2013 1 1 517 515 2 830
## 2: 2013 1 1 533 529 4 850
## 3: 2013 1 1 542 540 2 923
## 4: 2013 1 1 544 545 -1 1004
## 5: 2013 1 1 554 600 -6 812
## ---
## 838: 2013 1 1 2356 2359 -3 425
## 839: 2013 1 1 NA 1630 NA NA
## 840: 2013 1 1 NA 1935 NA NA
## 841: 2013 1 1 NA 1500 NA NA
## 842: 2013 1 1 NA 600 NA NA
## 12 variables not shown: [sched_arr_time <int>, arr_delay <num>, carrier <char>, flight <int>, tailnum <char>, origin <char>, dest <char>, air_time <num>, distance <num>, hour <num>, ...]
select_dt(flights, year, month, day)
## year month day
## <int> <int> <int>
## 1: 2013 1 1
## 2: 2013 1 1
## 3: 2013 1 1
## 4: 2013 1 1
## 5: 2013 1 1
## ---
## 336772: 2013 9 30
## 336773: 2013 9 30
## 336774: 2013 9 30
## 336775: 2013 9 30
## 336776: 2013 9 30
mutate_dt(flights,
gain = arr_delay - dep_delay,
speed = distance / air_time * 60
)
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <num> <int>
## 1: 2013 1 1 517 515 2 830
## 2: 2013 1 1 533 529 4 850
## 3: 2013 1 1 542 540 2 923
## 4: 2013 1 1 544 545 -1 1004
## 5: 2013 1 1 554 600 -6 812
## ---
## 336772: 2013 9 30 NA 1455 NA NA
## 336773: 2013 9 30 NA 2200 NA NA
## 336774: 2013 9 30 NA 1210 NA NA
## 336775: 2013 9 30 NA 1159 NA NA
## 336776: 2013 9 30 NA 840 NA NA
## 14 variables not shown: [sched_arr_time <int>, arr_delay <num>, carrier <char>, flight <int>, tailnum <char>, origin <char>, dest <char>, air_time <num>, distance <num>, hour <num>, ...]
transmute_dt(flights,
gain = arr_delay - dep_delay
) # 替换原有变量
## gain
## <num>
## 1: 9
## 2: 16
## 3: 31
## 4: -17
## 5: -19
## ---
## 336772: NA
## 336773: NA
## 336774: NA
## 336775: NA
## 336776: NA
summarise_dt(flights,
delay = mean(dep_delay, na.rm = TRUE)
)
## delay
## <num>
## 1: 12.63907
sample_n_dt(flights, 10)
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <num> <int>
## 1: 2013 3 30 937 940 -3 1221
## 2: 2013 9 26 1156 1200 -4 1441
## 3: 2013 4 19 922 930 -8 1027
## 4: 2013 6 25 1422 1350 32 1529
## 5: 2013 1 4 834 829 5 1134
## 6: 2013 6 16 634 630 4 856
## 7: 2013 5 16 2123 2130 -7 16
## 8: 2013 7 12 604 600 4 946
## 9: 2013 12 30 644 645 -1 917
## 10: 2013 12 17 353 1930 503 500
## 12 variables not shown: [sched_arr_time <int>, arr_delay <num>, carrier <char>, flight <int>, tailnum <char>, origin <char>, dest <char>, air_time <num>, distance <num>, hour <num>, ...]
sample_frac_dt(flights, 0.01)
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <num> <int>
## 1: 2013 3 31 1832 1835 -3 2023
## 2: 2013 8 29 1721 1725 -4 1949
## 3: 2013 12 14 NA 1500 NA NA
## 4: 2013 2 23 1104 1041 23 1324
## 5: 2013 12 16 1443 1429 14 1707
## ---
## 3363: 2013 10 25 1602 1600 2 1757
## 3364: 2013 8 18 1800 1800 0 2044
## 3365: 2013 7 28 1032 1039 -7 1221
## 3366: 2013 5 11 2035 2030 5 2158
## 3367: 2013 9 29 1225 1230 -5 1350
## 12 variables not shown: [sched_arr_time <int>, arr_delay <num>, carrier <char>, flight <int>, tailnum <char>, origin <char>, dest <char>, air_time <num>, distance <num>, hour <num>, ...]
flights %>%
summarise_dt( count = .N,
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE),by = tailnum)
## tailnum count dist delay
## <char> <int> <num> <num>
## 1: N14228 111 1546.964 3.711712
## 2: N24211 130 1330.262 7.700000
## 3: N619AA 24 1339.208 7.652174
## 4: N804JB 219 1424.621 -1.860465
## 5: N668DN 49 1027.592 2.625000
## ---
## 4040: N766SK 1 419.000 -24.000000
## 4041: N772SK 1 419.000 -8.000000
## 4042: N776SK 1 419.000 -18.000000
## 4043: N785SK 1 419.000 -16.000000
## 4044: N557AS 1 2402.000 -30.000000
# 多变量
flights %>%
summarise_dt(by = .(year,month,day),flights = .N)
## year month day flights
## <int> <int> <int> <int>
## 1: 2013 1 1 842
## 2: 2013 1 2 943
## 3: 2013 1 3 914
## 4: 2013 1 4 915
## 5: 2013 1 5 720
## ---
## 361: 2013 9 26 996
## 362: 2013 9 27 996
## 363: 2013 9 28 682
## 364: 2013 9 29 914
## 365: 2013 9 30 993
library(tidyfst)
library(nycflights13)
flights2 <- flights %>%
select_dt(year,month,day, hour, origin, dest, tailnum, carrier)
flights2 %>%
left_join_dt(airlines)
## Joining by: carrier
## Key: <carrier>
## carrier year month day hour origin dest tailnum
## <char> <int> <int> <int> <num> <char> <char> <char>
## 1: 9E 2013 1 1 8 JFK MSP N915XJ
## 2: 9E 2013 1 1 15 JFK IAD N8444F
## 3: 9E 2013 1 1 14 JFK BUF N920XJ
## 4: 9E 2013 1 1 15 JFK SYR N8409N
## 5: 9E 2013 1 1 15 JFK ROC N8631E
## ---
## 336772: YV 2013 9 29 16 LGA IAD N518LR
## 336773: YV 2013 9 29 17 LGA CLT N932LR
## 336774: YV 2013 9 30 16 LGA IAD N510MJ
## 336775: YV 2013 9 30 17 LGA CLT N905FJ
## 336776: YV 2013 9 30 20 LGA CLT N924FJ
## 1 variable not shown: [name <char>]
flights2 %>% left_join_dt(planes, by = "tailnum")
## Key: <tailnum>
## tailnum year.x month day hour origin dest carrier year.y type
## <char> <int> <int> <int> <num> <char> <char> <char> <int> <char>
## 1: <NA> 2013 1 2 15 JFK LAX AA NA <NA>
## 2: <NA> 2013 1 2 16 EWR ORD UA NA <NA>
## 3: <NA> 2013 1 3 8 EWR MIA UA NA <NA>
## 4: <NA> 2013 1 3 6 EWR DFW UA NA <NA>
## 5: <NA> 2013 1 4 8 JFK DCA 9E NA <NA>
## ---
## 336772: N9EAMQ 2013 9 27 16 LGA ATL MQ NA <NA>
## 336773: N9EAMQ 2013 9 29 12 LGA BNA MQ NA <NA>
## 336774: N9EAMQ 2013 9 29 18 LGA CMH MQ NA <NA>
## 336775: N9EAMQ 2013 9 30 11 JFK DCA MQ NA <NA>
## 336776: N9EAMQ 2013 9 30 14 JFK TPA MQ NA <NA>
## 6 variables not shown: [manufacturer <char>, model <char>, engines <int>, seats <int>, speed <int>, engine <char>]
flights2 %>% left_join_dt(airports, c("dest" = "faa"))
## Key: <dest>
## dest year month day hour origin tailnum carrier
## <char> <int> <int> <int> <num> <char> <char> <char>
## 1: ABQ 2013 10 1 20 JFK N554JB B6
## 2: ABQ 2013 10 2 20 JFK N607JB B6
## 3: ABQ 2013 10 3 20 JFK N591JB B6
## 4: ABQ 2013 10 4 20 JFK N662JB B6
## 5: ABQ 2013 10 5 19 JFK N580JB B6
## ---
## 336772: XNA 2013 9 29 17 LGA N725MQ MQ
## 336773: XNA 2013 9 30 7 LGA N735MQ MQ
## 336774: XNA 2013 9 30 8 EWR N14117 EV
## 336775: XNA 2013 9 30 15 LGA N725MQ MQ
## 336776: XNA 2013 9 30 17 LGA N720MQ MQ
## 7 variables not shown: [name <char>, lat <num>, lon <num>, alt <num>, tz <num>, dst <char>, tzone <char>]
df1 <- data.table(x = c(1, 2), y = 2:1)
df2 <- data.table(x = c(1, 3), a = 10, b = "a")
df1 %>% inner_join_dt(df2)
df1 %>% left_join_dt(df2)
df1 %>% right_join_dt(df2)
df1 %>% full_join_dt(df2)
flights %>%
anti_join_dt(planes, by = "tailnum") %>%
count_dt(tailnum, sort = TRUE)
## tailnum n
## <char> <int>
## 1: <NA> 2512
## 2: N725MQ 575
## 3: N722MQ 513
## 4: N723MQ 507
## 5: N713MQ 483
## ---
## 718: N7BKAA 1
## 719: N7CAAA 1
## 720: N5FCAA 1
## 721: N5ERAA 1
## 722: N647MQ 1
library(tidyfst)
library(tidyr)
relig_income
## # A tibble: 18 × 11
## religion `<$10k` $10-2…¹ $20-3…² $30-4…³ $40-5…⁴ $50-7…⁵ $75-1…⁶ $100-…⁷
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Agnostic 27 34 60 81 76 137 122 109
## 2 Atheist 12 27 37 52 35 70 73 59
## 3 Buddhist 27 21 30 34 33 58 62 39
## 4 Catholic 418 617 732 670 638 1116 949 792
## 5 Don’t know/r… 15 14 15 11 10 35 21 17
## 6 Evangelical … 575 869 1064 982 881 1486 949 723
## 7 Hindu 1 9 7 9 11 34 47 48
## 8 Historically… 228 244 236 238 197 223 131 81
## 9 Jehovah's Wi… 20 27 24 24 21 30 15 11
## 10 Jewish 19 19 25 25 30 95 69 87
## 11 Mainline Prot 289 495 619 655 651 1107 939 753
## 12 Mormon 29 40 48 51 56 112 85 49
## 13 Muslim 6 7 9 10 9 23 16 8
## 14 Orthodox 13 17 23 32 32 47 38 42
## 15 Other Christ… 9 7 11 13 13 14 18 14
## 16 Other Faiths 20 33 40 46 49 63 46 40
## 17 Other World … 5 2 3 4 2 7 3 4
## 18 Unaffiliated 217 299 374 365 341 528 407 321
## # … with 2 more variables: `>150k` <dbl>, `Don't know/refused` <dbl>, and
## # abbreviated variable names ¹`$10-20k`, ²`$20-30k`, ³`$30-40k`, ⁴`$40-50k`,
## # ⁵`$50-75k`, ⁶`$75-100k`, ⁷`$100-150k`
relig_income %>%
longer_dt("religion",name = "income",value = "count")
## religion income count
## <char> <fctr> <num>
## 1: Agnostic <$10k 27
## 2: Atheist <$10k 12
## 3: Buddhist <$10k 27
## 4: Catholic <$10k 418
## 5: Don’t know/refused <$10k 15
## ---
## 176: Orthodox Don't know/refused 73
## 177: Other Christian Don't know/refused 18
## 178: Other Faiths Don't know/refused 71
## 179: Other World Religions Don't know/refused 8
## 180: Unaffiliated Don't know/refused 597
fish_encounters %>%
wider_dt(name = "station",value = "seen")
## Key: <fish>
## fish Release I80_1 Lisbon Rstr Base_TD BCE BCW BCE2 BCW2
## <fctr> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1: 4842 1 1 1 1 1 1 1 1 1
## 2: 4843 1 1 1 1 1 1 1 1 1
## 3: 4844 1 1 1 1 1 1 1 1 1
## 4: 4845 1 1 1 1 1 NA NA NA NA
## 5: 4847 1 1 1 NA NA NA NA NA NA
## 6: 4848 1 1 1 1 NA NA NA NA NA
## 7: 4849 1 1 NA NA NA NA NA NA NA
## 8: 4850 1 1 NA 1 1 1 1 NA NA
## 9: 4851 1 1 NA NA NA NA NA NA NA
## 10: 4854 1 1 NA NA NA NA NA NA NA
## 11: 4855 1 1 1 1 1 NA NA NA NA
## 12: 4857 1 1 1 1 1 1 1 1 1
## 13: 4858 1 1 1 1 1 1 1 1 1
## 14: 4859 1 1 1 1 1 NA NA NA NA
## 15: 4861 1 1 1 1 1 1 1 1 1
## 16: 4862 1 1 1 1 1 1 1 1 1
## 17: 4863 1 1 NA NA NA NA NA NA NA
## 18: 4864 1 1 NA NA NA NA NA NA NA
## 19: 4865 1 1 1 NA NA NA NA NA NA
## 2 variables not shown: [MAE <int>, MAW <int>]
嵌套
library(tidyfst)
# nest by "cyl" column
mtcars_nested <- mtcars %>%
nest_dt(cyl) # you can use "cyl" too, very flexible
# inspect the output data.table
mtcars_nested
## cyl ndt
## <num> <list>
## 1: 6 <data.table[7x10]>
## 2: 4 <data.table[11x10]>
## 3: 8 <data.table[14x10]>
根据嵌套进行regression
mtcars_nested2 <- mtcars_nested %>%
mutate_dt(model = lapply(ndt,function(df) lm(mpg ~ wt, data = df)))
mtcars_nested2
## cyl ndt model
## <num> <list> <list>
## 1: 6 <data.table[7x10]> <lm[12]>
## 2: 4 <data.table[11x10]> <lm[12]>
## 3: 8 <data.table[14x10]> <lm[12]>
永远不需要将整个 data.frame 读入 RAM,需选择目标数据,立即处理它们并一次获得所有结果。