We want to get inspections for coffee shops. Let’s say a coffee shop is anything that has “COFFEE”, “ESPRESSO”, or “ROASTER” in the name. The regex for this is COFFEE|ESPRESSO|ROASTER, because | is a metacharacter that means “OR”. Use the str_detect() function, which returns TRUE if it finds the pattern you’re looking for and FALSE if it doesn’t.
# Load the saved workspace. The result of load() is deliberately not assigned
# (i.e. there is no `restaurants <- load("restaurants_sub.Rdata")`).
# NOTE(review): presumably the .RData file itself defines `restaurants` -- confirm.
load("restaurants_sub.RData")

# Upper-case the Name column, then keep only the rows whose Name contains
# COFFEE, ESPRESSO or ROASTER ("|" is the regex alternation operator).
coffee_data <- restaurants %>%
  mutate(Name = str_to_upper(Name)) %>%
  filter(str_detect(Name, "COFFEE|ESPRESSO|ROASTER"))

# Drop duplicate names and show the first few rows.
head(distinct(coffee_data, Name))

# For the following vector of randomly generated names, write a regular expression that:
# Randomly generated names used by the regular-expression exercises.
thenames <- c(
  "Jeremy Cruz", "Nathaniel Le", "Jasmine Chu",
  "Bradley Calderon Raygoza", "Quinten Weller",
  "Katelien Kanamu-Hauanio", "Zuhriyaa al-Amen",
  "Travale York", "Alexis Ahmed", "David Alcocer",
  "Jairo Martinez", "Dwone Gallegos", "Amanda Sherwood",
  "Hadiyya el-Eid", "Shaimaaa al-Can", "Sarah Love",
  "Shelby Villano", "Sundus al-Hashmi", "Dyani Loving",
  "Shanelle Douglas"
)

# First part
# "^" anchors the match at the start of the string, so this flags the names
# whose first character is a vowel (either case).
str_detect(thenames, "^[aeiouAEIOU]")
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [13] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

# Return the matching names themselves to check that every name starting
# with a vowel was found.
str_subset(thenames, "^[aeiouAEIOU]")
## [1] "Alexis Ahmed" "Amanda Sherwood"
# Second part
# my answer: whitespace, then a word boundary, then a vowel (either case),
# i.e. a word after the first one starts with a vowel.
str_detect(thenames, "\\s\\b[aeiouAEIOU]")
## [1] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE FALSE FALSE
## [13] FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE

str_subset(thenames, "\\s\\b[aeiouAEIOU]")
## [1] "Zuhriyaa al-Amen" "Alexis Ahmed" "David Alcocer" "Hadiyya el-Eid"
## [5] "Shaimaaa al-Can" "Sundus al-Hashmi"

# professor's answer: a literal space followed by a vowel (either case)
str_subset(thenames, " [aeiouAEIOU]")
## [1] "Zuhriyaa al-Amen" "Alexis Ahmed" "David Alcocer" "Hadiyya el-Eid"
## [5] "Shaimaaa al-Can" "Sundus al-Hashmi"
# Third part
# my answer: a vowel at the very start of the string OR a vowel that opens a
# later word (whitespace + word boundary + vowel).
str_detect(thenames, "^[aeiouAEIOU]|\\s\\b[aeiouAEIOU]")
## [1] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE FALSE FALSE
## [13] TRUE TRUE TRUE FALSE FALSE TRUE FALSE FALSE

str_subset(thenames, "^[aeiouAEIOU]|\\s\\b[aeiouAEIOU]")
## [1] "Zuhriyaa al-Amen" "Alexis Ahmed" "David Alcocer" "Amanda Sherwood"
## [5] "Hadiyya el-Eid" "Shaimaaa al-Can" "Sundus al-Hashmi"

# professor's answer: upper-case vowel at the start, or a space followed by a
# vowel of either case; returns the same seven names as above.
str_subset(thenames, "^[AEIOU].*| [AEIOUaeiou].*")
## [1] "Zuhriyaa al-Amen" "Alexis Ahmed" "David Alcocer" "Amanda Sherwood"
## [5] "Hadiyya el-Eid" "Shaimaaa al-Can" "Sundus al-Hashmi"

# professor's alternative: an UPPER-CASE vowel right after any word boundary.
# NOTE(review): this variant is not equivalent to the two patterns above. It
# returns five names instead of seven, because "Shaimaaa al-Can" and
# "Sundus al-Hashmi" only have a lower-case vowel at a word start. Confirm
# against the exercise wording whether that exclusion is intended.
str_subset(thenames, "\\b[AEIOU]")
## [1] "Zuhriyaa al-Amen" "Alexis Ahmed" "David Alcocer" "Amanda Sherwood"
## [5] "Hadiyya el-Eid"
# Fourth part (note that both of these will work)
# my answer: the string starts with a non-vowel, and somewhere later a
# whitespace character is followed by a word that starts with a non-vowel.
# The first class is written as a plain negated class, [^aeiouAEIOU]. Nesting
# another pair of brackets inside it ([^[...]]) only works because the ICU
# engine used by stringr treats it as a nested set; POSIX/PCRE engines would
# read it as a class followed by a literal "]".
thenames[str_detect(thenames, "^[^aeiouAEIOU].*\\s\\b[^aeiouAEIOU].*")]
## [1] "Jeremy Cruz" "Nathaniel Le"
## [3] "Jasmine Chu" "Bradley Calderon Raygoza"
## [5] "Quinten Weller" "Katelien Kanamu-Hauanio"
## [7] "Travale York" "Jairo Martinez"
## [9] "Dwone Gallegos" "Sarah Love"
## [11] "Shelby Villano" "Dyani Loving"
## [13] "Shanelle Douglas"
# professor's answer: the string does not start with an upper-case vowel, and
# a space is followed by a character that is not a vowel (either case).
str_subset(thenames, "^[^AEIOU].* [^aeiouAEIOU].*")
## [1] "Jeremy Cruz" "Nathaniel Le"
## [3] "Jasmine Chu" "Bradley Calderon Raygoza"
## [5] "Quinten Weller" "Katelien Kanamu-Hauanio"
## [7] "Travale York" "Jairo Martinez"
## [9] "Dwone Gallegos" "Sarah Love"
## [11] "Shelby Villano" "Dyani Loving"
## [13] "Shanelle Douglas"

# professor's alternative: same start condition; after the space comes a
# non-vowel, an optional hyphen, and then a character that is not an
# upper-case vowel.
str_subset(thenames, "^[^AEIOU].* [^aeiouAEIOU]-?[^AEIOU]")
## [1] "Jeremy Cruz" "Nathaniel Le"
## [3] "Jasmine Chu" "Bradley Calderon Raygoza"
## [5] "Quinten Weller" "Katelien Kanamu-Hauanio"
## [7] "Travale York" "Jairo Martinez"
## [9] "Dwone Gallegos" "Sarah Love"
## [11] "Shelby Villano" "Dyani Loving"
## [13] "Shanelle Douglas"
Consider the following string vector:
# Example strings: one without any phone number, two holding a single number,
# and one holding two numbers.
text <- c(
  "apple",
  "219 733 8965",
  "329-293-8753",
  "Work: (579) 499-7527; Home: (543) 355 3679"
)

# first part
# Phone-number pattern: optional "(", three digits, optional ")", optional
# "-", optional space, three digits, optional "-", optional space, four digits.
patt1 <- "\\(?\\d{3}\\)?-? ?\\d{3}-? ?\\d{4}"

# One list element per input string, holding every match found in it.
str_extract_all(text, patt1)
## [[1]]
## character(0)
##
## [[2]]
## [1] "219 733 8965"
##
## [[3]]
## [1] "329-293-8753"
##
## [[4]]
## [1] "(579) 499-7527" "(543) 355 3679"
# second part
# first method: extract every phone number, flatten the list into one
# character vector, then take the first run of three digits from each number.
text %>%
  str_extract_all(patt1) %>%
  unlist() %>%
  str_extract("(\\d{3})")
## [1] "219" "329" "579" "543"

# second method: capture the leading three digits as group 1. In each match
# matrix returned by str_match_all(), column 1 is the full match, so column 2
# holds that first capture group.
patt2 <- "\\(?(\\d{3})\\)?[- ].*?(\\d{3}[- ]\\d{4})\\)?"
str_match_all(text, patt2) %>%
  map(function(m) m[, 2, drop = FALSE]) %>%
  unlist()
## [1] "219" "329" "579" "543"
Use the appropriate lubridate function to parse each of the following dates:
# Each date string is paired with the lubridate parser whose name spells the
# order of its components (m = month, d = day, y = year).

# month, day, year
d1 <- "January 1, 2010"
mdy(d1)
## [1] "2010-01-01"

# year, month, day
d2 <- "2015-Mar-07"
ymd(d2)
## [1] "2015-03-07"

# day, month, year
d3 <- "06-Jun-2017"
dmy(d3)
## [1] "2017-06-06"

# character vector; the year is wrapped in parentheses
d4 <- c("August 19 (2015)", "July 1 (2015)")
mdy(d4)
## [1] "2015-08-19" "2015-07-01"

# Dec 30, 2014
d5 <- "12/30/14"
mdy(d5)
## [1] "2014-12-30"

# same strings as d4, but stored in a list instead of a character vector
d6 <- list("August 19 (2015)", "July 1 (2015)")
mdy(d6)
## [1] "2015-08-19" "2015-07-01"
library(nycflights13)
# Build a date-time from separate year/month/day values plus a clock time
# stored as a single number in HHMM form (e.g. 1530): integer division by 100
# yields the hour and the remainder yields the minute, and both are handed to
# make_datetime() together with the date parts.
make_datetime_100 <- function(year, month, day, time) {
  hh <- time %/% 100
  mm <- time %% 100
  make_datetime(year, month, day, hh, mm)
}
# Convert the four clock-time columns of `flights` into date-times, each
# combined with the row's own year/month/day, then keep only origin, dest and
# the columns whose names end in "delay" or "time".
flights_dt <- flights %>%
  mutate(
    across(
      c(dep_time, arr_time, sched_dep_time, sched_arr_time),
      ~ make_datetime_100(year, month, day, .x)
    )
  ) %>%
  select(origin, dest, ends_with("delay"), ends_with("time"))
# Bar chart of the median arrival delay for each day of the week on which the
# flight departed.
flights_dt %>%
  # Rows without an arrival delay are dropped before summarising.
  filter(!is.na(arr_delay)) %>%
  # wday(label = TRUE) gives the weekday as a labelled factor, not a number.
  mutate(dep_day = wday(dep_time, label = TRUE)) %>%
  group_by(dep_day) %>%
  # The summary is a median, so the column is named accordingly.
  summarise(median_delay = median(arr_delay, na.rm = TRUE), n = n()) %>%
  ggplot(aes(dep_day, median_delay)) +
  geom_bar(stat = "identity", fill = "purple") +
  xlab("day of the week") +
  ylab("median delay (mins)") +
  ggthemes::theme_economist()
#your code here
# Number of flights by the minute of the hour at which they actually
# departed, shown separately for early departures and on-time/late departures.
flights_dt %>%
  # Only flights with a recorded arrival delay are counted.
  filter(!is.na(arr_delay)) %>%
  mutate(
    # 1 = left before the scheduled departure time, 0 = otherwise. The
    # level order puts "early departure" first in the facets.
    early_dep = factor(
      ifelse(dep_time < sched_dep_time, 1, 0),
      levels = c(1, 0),
      labels = c("early departure", "on-time/late departure")
    ),
    minute = minute(dep_time)
  ) %>%
  group_by(minute, early_dep) %>%
  # Only the flight count is plotted, so no delay statistic is computed here.
  # .groups = "drop" returns an ungrouped result and silences the regrouping
  # message.
  summarise(n = n(), .groups = "drop") %>%
  ggplot(aes(x = minute, y = n)) +
  geom_bar(stat = "identity", aes(fill = early_dep)) +
  facet_wrap(~early_dep) +
  xlab("departure minute") +
  ylab("number of flights") +
  ggthemes::theme_economist_white(gray_bg = FALSE) +
  # The facet strips already name the groups, so the fill legend is hidden.
  theme(legend.position = "none")