Jesse Yang
February 23, 2017
The tidyverse is a collection of R packages that share common philosophies and are designed to work together.
library(tidyverse) # load above all
dat <- read.csv("AirlineDelays.txt")
dat <- read_csv("AirlineDelays.txt")
readr
subset(dat, CARRIER == "AA",
select = c("DEP_DELAY", "ARR_DELAY"))
or
dat[dat$CARRIER == "AA",
c("DEP_DELAY", "ARR_DELAY")]
filter(dat, CARRIER == "AA") %>%
select(DEP_DELAY, ARR_DELAY)
which is equivalent to
select(filter(dat, CARRIER == "AA"),
DEP_DELAY, ARR_DELAY)
x <- table(dat$ORIGIN, dat$DEST)
x <- as.data.frame(x)
colnames(x) <-
c("ORIGIN", "DEST", "n")
x[order(x$n, decreasing = TRUE), ]
dat %>%
count(ORIGIN, DEST, order = TRUE)
dat %>%
count(ORIGIN, DEST, sort = TRUE) %>%
head() %>% knitr::kable()
| ORIGIN | DEST | n |
|---|---|---|
| SFO | LAX | 1109 |
| LAX | SFO | 1092 |
| LAX | LAS | 1033 |
| LAS | LAX | 1032 |
| JFK | LAX | 921 |
| LAX | JFK | 919 |
aggregate(DEP_DELAY ~ CARRIER, dat, mean)
dat %>%
group_by(CARRIER) %>%
summarise(
dep = mean(DEP_DELAY, na.rm = TRUE))
dplyr is also much fasterdat[sample(1:nrow(dat), 10), ]
sample_n(dat, 10)
arrange orders of rows by the value of columns;select columnsdat %>%
select(CARRIER, ORIGIN, DEST, ARR_DELAY) %>%
# `desc` means descending order
arrange(desc(ARR_DELAY), CARRIER) %>%
head(4)
# A tibble: 4 × 4
CARRIER ORIGIN DEST ARR_DELAY
<chr> <chr> <chr> <int>
1 AA BNA DFW 1530
2 AA ONT DFW 1504
3 AA CMH LAX 1473
4 AA IAD DFW 1448
dat %>%
group_by(ORIGIN, DEST) %>%
select(ARR_DELAY, DEP_DELAY) %>%
summarise(
arr = mean(ARR_DELAY, na.rm = TRUE),
dep = mean(DEP_DELAY, na.rm = TRUE)
) %>%
filter(arr > dep) %>%
arrange(desc(arr), desc(dep), ORIGIN) %>%
head(5)
Source: local data frame [5 x 4]
Groups: ORIGIN [5]
ORIGIN DEST arr dep
<chr> <chr> <dbl> <dbl>
1 ORD PNS 164.0 150.3333
2 CAK DTW 150.0 146.0000
3 IND LGA 142.5 59.5000
4 PNS ORD 140.5 132.2500
5 HDN EWR 132.0 126.0000
x <- c("Jan to Mar", "Jan 1 to Sep 30", "June",
"Aug 1, 2016 to Sep 1, 2016")
tmp <- str_split(x, " to ", simplify = TRUE)
tmp
[,1] [,2]
[1,] "Jan" "Mar"
[2,] "Jan 1" "Sep 30"
[3,] "June" ""
[4,] "Aug 1, 2016" "Sep 1, 2016"
begin.date <- tmp[, 1]
end.date <- tmp[, 2]
CompleteDate <- function(dates, day = 1, year = 2016) {
idx <- -which(str_detect(dates, "\\b[0-9]{1,2}\\b"))
dates[idx] <- sprintf("%s %s, %s", dates[idx], day, year)
idx <- -which(str_detect(dates, "\\b[0-9]{4}\\s*$"))
dates[idx] <- str_c(dates[idx], year, sep = ", ")
dates
}
begin.date <- CompleteDate(begin.date)
end.date <- CompleteDate(end.date, day = 28)
begin.date
[1] "Jan 1, 2016" "Jan 1, 2016" "June 1, 2016" "Aug 1, 2016"
end.date
[1] "Mar 28, 2016" "Sep 30, 2016" " 28, 2016" "Sep 1, 2016"
library(lubridate)
data.frame(begin = mdy(begin.date), end = mdy(end.date)) %>%
mutate(duration = end - begin)
begin end duration
1 2016-01-01 2016-03-28 87 days
2 2016-01-01 2016-09-30 273 days
3 2016-06-01 <NA> NA days
4 2016-08-01 2016-09-01 31 days
^ $ * + . ? [ ] { } ( ) | \\' \" \n \t \b? * + {n} {n,} {n,m}? after quantifiers\1 to \9x <- c("ababa", "aabb", "bbaa")
str_match(x, "(a*)(b*)")
[,1] [,2] [,3]
[1,] "ab" "a" "b"
[2,] "aabb" "aa" "bb"
[3,] "bb" "" "bb"
str_match(x, "(a+)(b+)")
[,1] [,2] [,3]
[1,] "ab" "a" "b"
[2,] "aabb" "aa" "bb"
[3,] NA NA NA
x <- c("ababa", "aabb", "bbaa", "aaab")
str_match(x, "a{2}")
[,1]
[1,] NA
[2,] "aa"
[3,] "aa"
[4,] "aa"
str_match(x, "a{1,2}")
[,1]
[1,] "a"
[2,] "aa"
[3,] "aa"
[4,] "aa"
x <- "Hello (2010) 2010"
str_match(x, "(.*)([0-9]{4})")
[,1] [,2] [,3]
[1,] "Hello (2010) 2010" "Hello (2010) " "2010"
str_match(x, "(.*?)([0-9]{4})")
[,1] [,2] [,3]
[1,] "Hello (2010" "Hello (" "2010"
str_match(x, "(.*?) +(?:\\([0-9]{4}\\))? +([0-9]{4})")
[,1] [,2] [,3]
[1,] "Hello (2010) 2010" "Hello" "2010"
pattern <- "^ *([A-Z][a-z]+) +(\\d{1,2}), +(\\d{4})"
dates <- c("Jan 1, 2017", " Sep 2, 2011", "Dec 1, 95")
matched <- str_match(dates, pattern)
matched
[,1] [,2] [,3] [,4]
[1,] "Jan 1, 2017" "Jan" "1" "2017"
[2,] " Sep 2, 2011" "Sep" "2" "2011"
[3,] NA NA NA NA
matched %>% .[, 2:4] %>% as_tibble() %>%
rename(month = V1, day = V2, year = V3)
# A tibble: 3 × 3
month day year
<chr> <chr> <chr>
1 Jan 1 2017
2 Sep 2 2011
3 <NA> <NA> <NA>