library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/], provide code that identifies the majors whose names contain either “DATA” or “STATISTICS”.
# Download the FiveThirtyEight majors list from GitHub and parse it as a tibble.
major_data <- read_csv(
  file = "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
)
##
## -- Column specification --------------------------------------------------------
## cols(
## FOD1P = col_character(),
## Major = col_character(),
## Major_Category = col_character()
## )
# Preview the first rows to confirm the three character columns loaded.
head(x = major_data)
## # A tibble: 6 x 3
## FOD1P Major Major_Category
## <chr> <chr> <chr>
## 1 1100 GENERAL AGRICULTURE Agriculture & Natural Resources
## 2 1101 AGRICULTURE PRODUCTION AND MANAGEMENT Agriculture & Natural Resources
## 3 1102 AGRICULTURAL ECONOMICS Agriculture & Natural Resources
## 4 1103 ANIMAL SCIENCES Agriculture & Natural Resources
## 5 1104 FOOD SCIENCE Agriculture & Natural Resources
## 6 1105 PLANT SCIENCE AND AGRONOMY Agriculture & Natural Resources
# Keep only the majors whose name contains DATA or STATISTICS.
filtered <- major_data %>%
  filter(str_detect(Major, "DATA|STATISTICS"))
head(filtered)
## # A tibble: 3 x 3
## FOD1P Major Major_Category
## <chr> <chr> <chr>
## 1 6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS Business
## 2 2101 COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
## 3 3702 STATISTICS AND DECISION SCIENCE Computers & Mathematics
Write code that transforms the data below:
[1] “bell pepper” “bilberry” “blackberry” “blood orange”
[5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”
[9] “elderberry” “lime” “lychee” “mulberry”
[13] “olive” “salal berry”
Into a format like this:
c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)
Step 1: Copy in the data
# Console printout of the fruit vector, pasted in as a single string. Single
# quotes delimit the literal so the embedded double quotes need no escaping.
fruit_data <- '[1] "bell pepper" "bilberry" "blackberry" "blood orange"
[5] "blueberry" "cantaloupe" "chili pepper" "cloudberry"
[9] "elderberry" "lime" "lychee" "mulberry"
[13] "olive" "salal berry"'
# Print the raw string to inspect it before cleaning.
fruit_data
## [1] "[1] \"bell pepper\" \"bilberry\" \"blackberry\" \"blood orange\"\n\n[5] \"blueberry\" \"cantaloupe\" \"chili pepper\" \"cloudberry\" \n\n[9] \"elderberry\" \"lime\" \"lychee\" \"mulberry\" \n\n[13] \"olive\" \"salal berry\""
Step 2: Remove undesired characters
# Extract every fruit name from the raw printout. A name is a run of letters,
# optionally followed by further words that are each joined by one space.
# Index markers such as [1], the double quotes and the line breaks contain no
# letters, so they are never part of a match.
# The result is a list with one character vector per input string (here, one).
fruit_intermediate <- str_extract_all(
  fruit_data,
  pattern = "[A-Za-z]+(?: [A-Za-z]+)*"
)
fruit_intermediate
## [[1]]
## [1] "bell pepper" "bilberry" "blackberry" "blood orange" "blueberry"
## [6] "cantaloupe" "chili pepper" "cloudberry" "elderberry" "lime"
## [11] "lychee" "mulberry" "olive" "salal berry"
Step 3: Collapse the data
# Build the text of a c(...) call explicitly: wrap each fruit in double quotes,
# join the quoted names with ", ", then add the surrounding c( and ).
# [[1]] pulls the character vector out of the list returned by
# str_extract_all(); passing the list itself to str_c() only works through an
# implicit coercion that raises a warning.
fruit_final <- str_c(
  "c(",
  str_c('"', fruit_intermediate[[1]], '"', collapse = ", "),
  ")"
)
fruit_final
## [1] "c(\"bell pepper\", \"bilberry\", \"blackberry\", \"blood orange\", \"blueberry\", \"cantaloupe\", \"chili pepper\", \"cloudberry\", \"elderberry\", \"lime\", \"lychee\", \"mulberry\", \"olive\", \"salal berry\")"
Step 4: Use writeLines() to print the string without the escaping backslashes
# Write the raw string to the console so the quotes appear unescaped.
writeLines(text = fruit_final, con = stdout())
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")
Note: quotes were added around the first and third expressions; otherwise they would not work. (.)\1\1 matches any character appearing three times in a row, e.g. the three m’s in “hmmm”. (.)(.)\2\1 matches a four-character palindrome, i.e. two characters followed by the same two characters in reverse order, e.g. “ahha”. (..)\1 matches a group of two characters that repeats immediately afterwards, e.g. “uhuh”. (.).\1.\1 matches a character that occurs three times with exactly one (arbitrary) character between consecutive occurrences, e.g. the “ahaha” in “bwahaha”. (.)(.)(.).*\3\2\1 matches a group of three characters followed by the same three characters in reverse order, with any number of characters between the two groups, e.g. “ahhhha” and “ummnooooooommu”.
# Sample strings used to demonstrate each backreference pattern.
programmer_sounds <- c(
  "hmmm", "ahha", "uhuh", "bwahaha", "ahhhha", "ummnooooooommu"
)

# One character three times in a row.
str_view(string = programmer_sounds, pattern = "(.)\\1\\1")
# Two characters followed by the same two in reverse order.
str_view(string = programmer_sounds, pattern = "(.)(.)\\2\\1")
# A pair of characters that repeats immediately.
str_view(string = programmer_sounds, pattern = "(..)\\1")
# The same character at every other position, three times.
str_view(string = programmer_sounds, pattern = "(.).\\1.\\1")
# Three characters, anything, then those three characters reversed.
str_view(string = programmer_sounds, pattern = "(.)(.)(.).*\\3\\2\\1")
Construct regular expressions to match words that:
Start and end with the same character. Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.) Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)
# Words that start and end with the same character. The ^ and $ anchors pin the
# captured character to the first and last positions; without them the pattern
# matches any word that merely contains a repeated character. The optional
# group lets a one-character word match as well.
str_view(c("pip", "doomed"), "^(.)(.*\\1)?$")
# Words containing a repeated pair of letters, e.g. the "ch" in "church".
str_view(c("church", "booboo"), "([A-Za-z][A-Za-z]).*\\1")
# Words in which one letter occurs in at least three places, e.g. the three
# "e"s in "eleven".
str_view(c("eleven", "seventeen"), "([A-Za-z]).*\\1.*\\1")