This report is a summary of a lesson by Colin Fay on DataCamp.
keep(.x, .p): extract elements that satisfy a condition
discard(.x, .p): remove elements that satisfy a condition
all_visits <- list(c(117, 147, 131, 73, 81, 134, 121),
                   c(180, 193, 116, 166, 131, 153, 146),
                   c(57, 110, 68, 72, 87, 141, 67))
day <- c("mon", "tue", "wed", "thu", "fri", "sat", "sun")
all_visits_named <- map(all_visits, ~ set_names(.x, day))
# Create a mapper that will test if .x is over 100
threshold <- as_mapper(~ .x > 100)
# Run this mapper on the all_visits_named object: group_over
map(all_visits_named, ~ keep(.x, threshold))
## [[1]]
## mon tue wed sat sun
## 117 147 131 134 121
##
## [[2]]
## mon tue wed thu fri sat sun
## 180 193 116 166 131 153 146
##
## [[3]]
## tue sat
## 110 141
# Run this mapper on the all_visits_named object: group_under
map(all_visits_named, ~ discard(.x, threshold))
## [[1]]
## thu fri
## 73 81
##
## [[2]]
## named numeric(0)
##
## [[3]]
## mon wed thu fri sun
## 57 68 72 87 67
Predicates: return TRUE or FALSE
The .p argument of keep() and discard() must be a predicate functional.
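Any function that returns a single TRUE or FALSE works as .p; a quick illustration (not from the lesson) with the base predicate is.numeric():
keep(list(1, "a", 2), is.numeric)
## [[1]]
## [1] 1
##
## [[2]]
## [1] 2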
every(): does every element satisfy a condition?
some(): do some elements satisfy a condition?
detect_index(): returns the index of the first element that satisfies .p; with .right = TRUE (equivalently .dir = "backward") it returns the index of the last one
detect(): returns the matching value rather than the index
has_element(): checks whether a given element is contained in the list
These are illustrated after the every()/some() examples below.
# Create a threshold variable, set it to 160
threshold <- 160
# Create a mapper that tests if .x is over the defined threshold
over_threshold <- as_mapper(~ .x > threshold)
# Are all elements in each all_visits vector over the defined threshold?
map(all_visits, ~ every(.x, over_threshold))
## [[1]]
## [1] FALSE
##
## [[2]]
## [1] FALSE
##
## [[3]]
## [1] FALSE
# Are some elements in each all_visits vector over the defined threshold?
map(all_visits, ~ some(.x, over_threshold))
## [[1]]
## [1] FALSE
##
## [[2]]
## [1] TRUE
##
## [[3]]
## [1] FALSE
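detect_index(), detect(), and has_element() follow the same pattern; a minimal sketch (not part of the original exercise) on the first visits vector:
over_100 <- as_mapper(~ .x > 100)
x <- all_visits[[1]]                          # 117 147 131 73 81 134 121
detect_index(x, over_100)                     # first matching index
## [1] 1
detect_index(x, over_100, .dir = "backward")  # last matching index
## [1] 7
detect(x, over_100)                           # the value, not the index
## [1] 117
has_element(x, 147)
## [1] TRUE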
To understand computations in R, two slogans are helpful:
- Everything that exists is an object.
- Everything that happens is a function call. - John Chambers
Everything that exists in R is an object with a name and a value, so functions are objects too, and everything that happens in R is a function call.
Functions such as sum() and mean() are pure: the same input always yields the same output.
Conversely, functions such as Sys.time() are impure, because their result depends on when they are run.
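For example:
sum(1:3)    # pure: always 6 for this input
## [1] 6
Sys.time()  # impure: the result depends on when it is called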
safe_log <- safely(log)
safe_log(1)
## $result
## [1] 0
##
## $error
## NULL
safe_log("a")
## $result
## NULL
##
## $error
## <simpleError in .Primitive("log")(x, base): non-numeric argument to mathematical function>
possible_sum <- possibly(sum, otherwise = "nop")
possible_sum(1)
## [1] 1
possible_sum("a")
## [1] "nop"
urls <- c(
"https://thinkr.fr", "https://colinfay.me",
"https://not_working.org", "https://en.wikipedia.org",
"https://www.datacamp.com", "https://not_working_either.org")
url_tester <- function(url_list) {
url_list %>%
# Map a version of read_lines() (from readr) that otherwise returns 404
map(possibly(read_lines, otherwise = 404)) %>%
# Set the names of the result
set_names( urls ) %>%
# paste() and collapse each element
map(paste, collapse = " ") %>%
# Remove the 404
discard(~.x == 404) %>%
# Will return the names of the good ones
names()
}
url_tester(urls)
## [1] "https://thinkr.fr" "https://colinfay.me"
## [3] "https://en.wikipedia.org"
compact(): removes the NULLs
Use otherwise = NULL in possibly() and then compact() the result to drop the failed calls.
l <- list(1, 2, 3, "a")
possible_log <- possibly(log, otherwise = NULL)
map(l, possible_log) %>% compact()
## [[1]]
## [1] 0
##
## [[2]]
## [1] 0.6931472
##
## [[3]]
## [1] 1.098612
flatten(): removes one level
my_list <- list(
list(a = 1),
list(b = 2)
)
my_list
## [[1]]
## [[1]]$a
## [1] 1
##
##
## [[2]]
## [[2]]$b
## [1] 2
flatten(my_list)
## $a
## [1] 1
##
## $b
## [1] 2
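flatten() removes exactly one level; typed variants such as flatten_dbl() also coerce the result to an atomic vector (a quick illustration, not from the lesson):
flatten_dbl(list(1, 2, 3))
## [1] 1 2 3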
Clean code is:
- Light
- Readable
- Interpretable
- Maintainable
compose(): compose a new function from two other functions; the composed functions are applied from right to left (below, mean() runs first, then round())
rounded_mean <- compose(round, mean)
rounded_mean(1:2811)
## [1] 1406
negate(): flip the logical
is_not_na <- negate(is.na)
x <- c(1, 2, 3, 4, NA)
is.na(x)
## [1] FALSE FALSE FALSE FALSE TRUE
is_not_na(x)
## [1] TRUE TRUE TRUE TRUE FALSE
partial(): prefill a function (set its arguments in advance, so a later change only needs to be made in one place)
rounded_mean <- compose(
partial(round, digits = 2),
partial(mean, na.rm = TRUE)
)
rounded_mean(airquality$Ozone)
## [1] 42.13
urls <- c("https://thinkr.fr", "https://colinfay.me", "https://en.wikipedia.org", "https://www.datacamp.com")
# Prefill html_nodes() with the css param set to h2
# (read_html(), html_nodes(), and html_text() come from the rvest package)
get_h2 <- partial(html_nodes, css = "h2")
# Combine the html_text, get_h2 and read_html functions
get_content <- compose(html_text, get_h2, read_html)
# Map get_content to the urls list
res <- map(urls, get_content) %>%
set_names(urls)
# Print the results to the console
res
## $`https://thinkr.fr`
## [1] "\n\t\t\t\t\t\t\t\t\t\t\t\t\t"
## [2] "\n\t\t\t\t\t\t\t\t\t\t\t\t\t"
## [3] "Nos formations Certifiantes à R sont finançables à 100% via le CPF"
## [4] "R niveau 3 – Développeur – Conception d’interfaces Shiny – Formation certifiante Mars 2025"
## [5] "Déboguer une fonction avec debugonce() ou browser()"
## [6] "\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\tAfficher le numéro01 85 09 14 03\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\t"
## [7] "Des formateurs amouReux"
## [8] "Bénéficiez d'une formation sur-mesure pour vous et votre équipe"
## [9] "Les différents moyens de faire financer votre formation."
##
## $`https://colinfay.me`
## character(0)
##
## $`https://en.wikipedia.org`
## [1] "From today's featured article" "Did you know ..."
## [3] "In the news" "On this day"
## [5] "Today's featured picture" "Other areas of Wikipedia"
## [7] "Wikipedia's sister projects" "Wikipedia languages"
##
## $`https://www.datacamp.com`
## [1] "Create Your Free Account"
## [2] "Grow your data skills with DataCamp for Mobile"
## [3] "Empower your business with world-class data and AI skills."
## [4] "What is DataCamp?"
## [5] "We learn best by doing"
## [6] ".css-ou6dz6{color:#03ef62;}Hands-on interactive learning experience"
## [7] "Learning paths designed by experts"
## [8] "Land your dream job in data science"
## [9] "Build your data portfolio"
## [10] "Don't just take our word for it."
## [11] "Join more than .css-ou6dz6{color:#03ef62;}15 million learners worldwide"
## [12] "Grow your data skills with DataCamp for Mobile"
A data.frame with a list column
urls_2 <- tibble(urls = c("https://thinkr.fr", "https://colinfay.me", "https://en.wikipedia.org", "https://www.datacamp.com"))
get_h2 <- partial(html_nodes, css = "h2")
get_content <- compose(html_text, get_h2, read_html)
urls_2 %>% mutate(links = map(urls, get_content))
## # A tibble: 4 × 2
## urls links
## <chr> <list>
## 1 https://thinkr.fr <chr [9]>
## 2 https://colinfay.me <chr [0]>
## 3 https://en.wikipedia.org <chr [8]>
## 4 https://www.datacamp.com <chr [12]>
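To get one row per extracted element instead of a list column, tidyr::unnest() can be applied afterwards (a sketch, assuming the tidyr package is available):
urls_2 %>%
  mutate(links = map(urls, get_content)) %>%
  tidyr::unnest(links)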
The rstudioconf dataset
rstudioconf <- read_rds("./datasets/#RStudioConf.rds")
rstudioconf <- split(rstudioconf, seq(nrow(rstudioconf)))
length(rstudioconf)
## [1] 5055
length(rstudioconf[[1]])
## [1] 42
vec_depth(rstudioconf)
## [1] 4
# Create a sublist of non-retweets
non_rt <- discard(rstudioconf, "is_retweet")
# Extract the favorite count element of each non_rt sublist
fav_count <- map_dbl(non_rt, "favorite_count")
# Get the median of favorite_count for non_rt
median(fav_count)
## [1] 1
"is_retweet" 칼럼이 logical이기에 discard
사용 가능
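The string is turned into an extractor by as_mapper(), so the extracted logical itself acts as the predicate. A toy illustration (made-up data):
tweets <- list(
  list(text = "original", is_retweet = FALSE),
  list(text = "RT ...",   is_retweet = TRUE)
)
discard(tweets, "is_retweet") %>% map_chr("text")
## [1] "original"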
union(x, y): returns the union of two sets
setdiff(x, y): returns the set difference, i.e. the elements contained in x but not in y
# Keep the RT, extract the user_id, remove the duplicates
rt <- keep(rstudioconf, "is_retweet") %>%
map_chr("user_id") %>%
unique()
# Remove the RT, extract the user id, remove the duplicate
non_rt <- discard(rstudioconf, "is_retweet") %>%
map_chr("user_id") %>%
unique()
# Determine the total number of users
union(rt, non_rt) %>% length()
## [1] 1742
# Determine the number of users who have only retweeted
setdiff(rt, non_rt) %>% length()
## [1] 1302
# Prefill mean() with na.rm, and round() with digits = 1
mean_na_rm <- partial(mean, na.rm = TRUE)
round_one <- partial(round, digits = 1)
# Compose a rounded_mean function
rounded_mean <- compose(round_one, mean_na_rm)
# Extract the non retweet
non_rt <- discard(rstudioconf, "is_retweet")
# Extract "favorite_count", and pass it to rounded_mean()
non_rt %>%
map_dbl("favorite_count") %>%
rounded_mean()
## [1] 3.3
# Combine as_vector(), compact(), and flatten()
flatten_to_vector <- compose(as_vector, compact, flatten)
# Complete the function
extractor <- function(list, what = "mentions_screen_name"){
map( list , what ) %>%
flatten_to_vector()
}
# Create six_most, with tail(), sort(), and table()
six_most <- compose(tail, sort, table)
# Run extractor() on rstudioconf
extractor(rstudioconf) %>%
six_most()
## .
## JennyBryan hadleywickham AmeliaMN juliasilge drob
## 278 308 362 376 418
## rstudio
## 648
stringr::str_detect(): pattern detection
# # Extract the "urls_url" elements, and flatten() the result
# urls_clean <- map(rstudioconf, "urls_url") %>%
# flatten()
#
# # Remove the NULL
# compact_urls <- compact(urls_clean)
#
# # Create a mapper that detects the pattern "github"
# has_github <- as_mapper(~ str_detect(.x, "github"))
# # Look for the "github" pattern, and sum the result
# map_lgl( compact_urls, has_github ) %>%
# sum()
# # From previous step
# str_prop_detected <- function(vec, pattern) {
# vec %>%
# str_detect(pattern) %>%
# mean()
# }
# flatten_and_compact <- compose(compact, flatten)
#
# rstudioconf %>%
# # From each element, extract "urls_url"
# map("urls_url") %>%
# # Flatten and compact
# flatten_and_compact() %>%
# # Get proportion of URLs containing "github"
# str_prop_detected("github")
# Implementation skipped because of a data mismatch
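Here is a self-contained toy version of the same idea instead (the URLs are made up):
library(stringr)
toy_urls <- list("https://github.com/tidyverse/purrr", "https://thinkr.fr", NULL)
toy_urls %>%
  compact() %>%                           # drop the NULL
  map_lgl(~ str_detect(.x, "github")) %>%
  mean()                                  # proportion containing "github"
## [1] 0.5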
map_at(): apply .f only at a specific place
my_list <- list(
a = 1:10,
b = 1:100,
c = 12
)
map_at(.x = my_list, .at = "b", .f = sum)
## $a
## [1] 1 2 3 4 5 6 7 8 9 10
##
## $b
## [1] 5050
##
## $c
## [1] 12
# Create mean_above, a mapper that tests if .x is over 3.3
mean_above <- as_mapper(~ .x > 3.3)
# Prefill map_at() with "retweet_count", mean_above for above,
# and mean_above negation for below
above <- partial(map_at, .at = "retweet_count", .f := mean_above)
below <- partial(map_at, .at = "retweet_count", .f := negate(mean_above))
# Map above() and below() on non_rt, keep the "retweet_count"
ab <- map(non_rt, above) %>% keep("retweet_count")
bl <- map(non_rt, below) %>% keep("retweet_count")
# Compare the size of both elements
length(ab)
## [1] 83
length(bl)
## [1] 1741
# Get the max() of "retweet_count"
max_rt <- map_int(non_rt, "retweet_count") %>%
max()
# Prefill map_at() with a mapper testing if .x equals max_rt
max_rt_calc <- partial(map_at, .at = "retweet_count", .f := ~ .x == max_rt)
res <- non_rt %>%
# Call max_rt_calc() on each element
map(max_rt_calc) %>%
# Keep elements where retweet_count is TRUE
keep("retweet_count") %>%
# Flatten it
flatten()
# Print the "screen_name" and "text" of the result
res$screen_name
## [1] "kearneymw"
res$text
## [1] "The week of #rstudioconf is a good time to remind everyone that some important books are [intentionally] available online for free:\nhttps://t.co/ePMiKs3MAr\nhttps://t.co/NHR7wmLGgd\nhttps://t.co/wbymwjG0CD\nhttps://t.co/uwqG0q967M\nhttps://t.co/AjXTfZgyAg\nhttps://t.co/zgoHq51PGV"