May 29, 2018
Niels Ole Dam
Physics and Communications at Roskilde University
Previously:
Since 2014:
Specialties:
Agile Workflows, Personal Productivity, Data Wrangling, Meeting Facilitations, Process Analysis and Visualization, Process Mining, Data Analysis
# The problem: base data frames partially match column names with `$`, and
# with stringsAsFactors = TRUE (the default before R 4.0) strings become
# factors.
df <- data.frame(xyz = "a", stringsAsFactors = TRUE)
df$x                # partial matching silently returns column `xyz`
class(df$xyz)       # "factor"
as.numeric(df$xyz)  # 1 -- the factor's integer code, not the value

# The Tidyverse solution: tibbles never partially match and keep strings as
# character vectors. tibble() replaces the deprecated data_frame().
library(dplyr)
df <- tibble(xyz = "a")
df$x                # NULL, with a warning about the unknown column
class(df$xyz)       # "character"
as.numeric(df$xyz)  # NA, with a coercion warning
For me:
A subset of data from the World Health Organization Global Tuberculosis Report, and accompanying global populations.
# Attach the tidyverse, then print the `who` dataset.
library(tidyverse)
who
# Reshape from wide to long: every column other than country/year becomes a
# (sex_and_age, cases) pair.
df = pd.melt(
    df,
    id_vars=["country", "year"],
    value_name="cases",
    var_name="sex_and_age",
)

# Extract Sex, Age lower bound and Age upper bound group.
# A raw string keeps \D and \d from being read as (invalid) Python string
# escapes; the regex itself is unchanged.
tmp_df = df["sex_and_age"].str.extract(r"(\D)(\d+)(\d{2})", expand=True)

# Name columns
tmp_df.columns = ["sex", "age_lower", "age_upper"]

# Create `age` column based on `age_lower` and `age_upper`
tmp_df["age"] = tmp_df["age_lower"] + "-" + tmp_df["age_upper"]

# Merge
df = pd.concat([df, tmp_df], axis=1)

# Drop unnecessary columns and rows
df = df.drop(["sex_and_age", "age_lower", "age_upper"], axis=1)
df = df.dropna()

# DataFrame.sort() has been removed from pandas; sort_values() replaces it.
df = df.sort_values(by=["country", "year", "sex", "age"], ascending=True)
df.head(10)
library(tidyverse)
# Tidy `who` one named step at a time. local() keeps the intermediate
# results out of the workspace, so only `who_tidy` is created.
who_tidy <- local({
  # Rename "newrel..." columns to "new_rel..." before the codes are split.
  renamed <- setNames(who, gsub("newrel", "new_rel", names(who)))

  # Stack columns 5 to 60 into code/value pairs.
  long <- gather(renamed, "code", "value", 5:60)

  # Split each code into three parts, then split `sexage` after its first
  # character into sex and age.
  parts <- separate(long, code, c("new", "var", "sexage"))
  parts <- separate(parts, sexage, c("sex", "age"), sep = 1)

  # One column per value of `var`; the ISO code columns are dropped.
  wide <- spread(parts, var, value)
  select(wide, -iso2, -iso3)
})

# Print the data before and after tidying.
who
who_tidy
library(dplyr); library(rvest); library(jsonlite)
# Collect the text of every link on the tidyverse packages page; the package
# names are among them.
url <- "https://www.tidyverse.org/packages/"
tidy_packages <- url %>%
  read_html() %>%
  html_nodes("a") %>%
  html_text() %>%
  unique()

# Add further package names to count as tidyverse in this comparison.
tidy_packages <- c(
  tidy_packages,
  "tidyverse", "dbplyr", "tidyselect", "plyr", "lazyeval"
)

# Top 100 downloads of the last month, flagged by membership of the list.
cran_top_100 <-
  fromJSON("https://cranlogs.r-pkg.org/top/last-month/100")$downloads
cran_top_100$Tidyverse <- cran_top_100$package %in% tidy_packages

# Turn the row names into a `rank` column. rownames_to_column() belongs to
# tibble, which this snippet's own library() calls do not attach, so the call
# is namespace-qualified.
cran_top_100 <- cran_top_100 %>%
  tibble::rownames_to_column(var = "rank")

# Show only the tidyverse packages. The logical column is used directly
# instead of comparing against the reassignable shorthand `T`.
cran_top_100 %>%
  filter(Tidyverse) %>%
  select(-Tidyverse)
rank package downloads
4 stringr 503705
7 ggplot2 439296
8 tibble 425023
11 dplyr 398653
12 glue 390245
18 magrittr 347332
23 plyr 336000
25 jsonlite 308499
26 lazyeval 308343
37 purrr 260274
39 readxl 254079
40 tidyr 253041
44 lubridate 246618
45 tidyselect 243500
47 readr 238927
48 hms 238774
54 httr 224839
60 DBI 202635
66 haven 187368
69 forcats 181827
77 xml2 156151
78 broom 155492
library(scales)
# Download totals per Tidyverse flag, expressed as a share of the overall
# total. `downloads` is converted with as.numeric() before summing.
tidyverse_pct <- mutate(
  summarise(
    group_by(cran_top_100, Tidyverse),
    total = sum(as.numeric(downloads))
  ),
  total = percent(total / sum(total))
)
Tidyverse total
FALSE 75.4%
TRUE 24.6%
[ Demo ]
Tip: readr is integrated into RStudio's file navigation
# Three example lists with different nesting, one assignment per line.
x1 <- list(c(1, 2), c(3, 4))        # a list of two numeric vectors
x2 <- list(list(1, 2), list(3, 4))  # a list of two lists
x3 <- list(1, list(2, list(3)))     # lists nested inside lists
[ Demo ]
# Open the "INITIALIZE" and "STOP" elements of outputTables in the viewer.
View(outputTables[["INITIALIZE"]])
View(outputTables[["STOP"]])
library(httr)
library(jsonlite)
# Request an OAuth 2.0 token from Spotify.
#
# app_id, client_id, client_secret: passed on, in that order, to
#   httr::oauth_app().
# Returns the result of httr::oauth2.0_token() for the
#   "playlist-modify-public" scope.
spotifyOAuth <- function(app_id, client_id, client_secret) {
  endpoint <- httr::oauth_endpoint(
    authorize = "https://accounts.spotify.com/authorize",
    access = "https://accounts.spotify.com/api/token"
  )
  app <- httr::oauth_app(app_id, client_id, client_secret)
  httr::oauth2.0_token(endpoint, app, scope = "playlist-modify-public")
}
# The client id and secret below are placeholder strings.
keys <- spotifyOAuth(
  app_id = "roskilde-2017",
  client_id = "your_client_id",
  client_secret = "your_client_secret"
)
# Search the Spotify API for an artist (type "artist", market "DK").
#
# artistName: search term; spaces are replaced by "+" before it is put into
#   the query string.
# Returns the id, name, popularity, genres and type columns of the matching
#   items plus a numeric `followers` column, or NA when the response reports
#   no artists.
searchArtist <- function(artistName) {
  query <- gsub(" ", "+", artistName, fixed = TRUE)
  response <- httr::RETRY(
    "GET",
    paste0(
      "https://api.spotify.com/v1/search?q=",
      query, "&type=artist&market=DK"
    ),
    times = 30
  )

  # content() is qualified like the other httr calls, and the encoding is
  # stated explicitly instead of being guessed on every request.
  body <- httr::content(response, "text", encoding = "UTF-8")
  req <- jsonlite::fromJSON(body)

  # Guard clause: a missing or zero total means there is nothing to return.
  total <- req$artists$total
  if (is.null(total) || total <= 0) {
    return(NA)
  }

  artist <- req$artists$items[, c("id", "name", "popularity", "genres", "type")]
  artist$followers <- as.numeric(req$artists$items$followers$total)
  artist
}
# Call searchArtist() once per element of rf_artists$encodedName; the result
# is a list with one element per name.
sp_artists_raw <- lapply(X = rf_artists$encodedName, FUN = searchArtist)
# The problem: data.frame() is written column by column, so the values that
# belong to one schedule are spread across separate c() calls.
# NOTE(review): `artists` is defined outside this snippet -- confirm its type.
df <- data.frame(artists = c(artists, artists, artists, artists),
show_main_period = c(FALSE, TRUE, FALSE, TRUE),
schedule_name = c("scheduleUpcoming", "scheduleMain", "scheduleUpcomingWithURL",
"scheduleMainWithURL"))
# The Tidyverse solution: tribble() is written row by row. The first line
# names the columns with ~name; each following line is one schedule.
# NOTE(review): `artists` and `sp_artists` come from outside this snippet;
# presumably they end up in list-columns -- confirm.
df <- tribble(
~artists, ~show_main_period, ~add_url, ~YEAR, ~sp_data, ~path, ~schedule_name,
artists, FALSE, FALSE, "2017", sp_artists, "data_out", "scheduleUpcoming",
artists, TRUE, FALSE, "2017", sp_artists, "data_out", "scheduleMain",
artists, FALSE, TRUE, "2017", sp_artists, "data_out", "scheduleUpcomingWithURL",
artists, TRUE, TRUE, "2017", sp_artists, "data_out", "scheduleMainWithURL")
[ Demo - Roskilde Festival ]
# pmap() calls rnorm() once per position, passing the matching elements of
# n, mu and sigma in that order.
# NOTE(review): mu and sigma are not defined in this snippet -- confirm they
# are created beforehand.
n <- list(1, 3, 5)
args1 <- list(n, mu, sigma)
str(pmap(args1, rnorm))
map
map_lgl
map_chr
map_int
map_dbl
map_df
walk
library(gapminder)
library(tidyr)
library(dplyr)
View(gapminder)

# Add year1950 (the year minus 1950), group by continent and country, and
# nest the grouped data.
by_country <- nest(
  group_by(
    mutate(gapminder, year1950 = year - 1950),
    continent, country
  )
)

View(by_country)
[ Gapminder Demo ]
library(purrr)
# Fit a linear model of lifeExp on year1950.
#
# df: a data frame with columns lifeExp and year1950.
# Returns the fitted lm object.
country_model <- function(df) {
  fit <- lm(formula = lifeExp ~ year1950, data = df)
  fit
}
# Apply country_model() to every element of the `data` column and store the
# fits in a new `model` column.
by_country2 <- mutate(by_country, model = map(data, country_model))

View(by_country2)
[ Demo - cont. ]
library(modelr)
# Call add_residuals() on each data/model pair and store the results in a
# new `resids` column.
by_country3 <- mutate(
  by_country2,
  resids = map2(data, model, add_residuals)
)

View(by_country3)
[ Demo - cont. ]
library(ggplot2)
# Plot resid against year: one line per country, one panel per continent.
ggplot(
  unnest(by_country3, resids),
  aes(year, resid, group = country)
) +
  geom_line(alpha = 1 / 3) +
  facet_wrap(~continent)
[ Demo - cont. ]
library(dplyr)
library(dbplyr)
# Connect to an in-memory SQLite database. RSQLite names this argument
# `dbname`; `path` is not one of its arguments, so it did not select the
# in-memory database.
con <- DBI::dbConnect(RSQLite::SQLite(), dbname = ":memory:")

# Copy the flights data into the database as a permanent table with indexes.
copy_to(con, nycflights13::flights, "flights",
  temporary = FALSE,
  indexes = list(
    c("year", "month", "day"),
    "carrier",
    "tailnum",
    "dest"
  )
)

# Reference to the database table.
flights_db <- tbl(con, "flights")

# Mean arrival delay and number of flights per tail number, sorted by
# descending delay and limited to tail numbers with more than 100 flights.
tailnum_delay_db <- flights_db %>%
  group_by(tailnum) %>%
  summarise(
    delay = mean(arr_delay, na.rm = TRUE),
    n = n()
  ) %>%
  arrange(desc(delay)) %>%
  filter(n > 100)

# Show the generated SQL, then pull the result into a local data frame.
show_query(tailnum_delay_db)
x <- collect(tailnum_delay_db)
Five commonly used backends are:
library(sparklyr)
library(dplyr)
library(nycflights13)
library(ggplot2)
# Connect to a local Spark instance and copy both tables to it.
sc <- spark_connect(master = "local")
flights <- copy_to(sc, flights, "flights")
airlines <- copy_to(sc, airlines, "airlines")
src_tbls(sc)

# Build the query one verb at a time. local() keeps the intermediate steps
# out of the workspace, so only `data` is created.
data <- local({
  carriers <- c("UA", "WN", "AA", "DL")
  step <- filter(flights, day == 17, month == 5, carrier %in% carriers)
  step <- select(step, year, month, day, carrier, dep_delay, air_time, distance)
  step <- arrange(step, year, month, day, carrier)
  step <- mutate(step, air_time_hours = air_time / 60)
  step <- group_by(step, carrier)
  summarize(step, count = n(), mean_dep_delay = mean(dep_delay, na.rm = TRUE))
})

# Show the generated query, then collect the result into `carrierhours`.
show_query(data)
carrierhours <- collect(data)
[ Demo ]
“The Tidyverse is where we want to be!”
Andrew Gelman
Prof. Columbia University
#1 Stan Core Developer
(among other things)
Tidyverse is coming to a galaxy near you!
(and you really should embrace it)
:-D