library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
theLink <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
# Read the FiveThirtyEight majors list. read.csv() already defaults to
# header = TRUE and sep = ",", so only the location needs to be passed.
df_majors <- read.csv(theLink)
# Preview the first six rows (the default for head()).
head(df_majors, n = 6L)
## FOD1P Major
## 1 1100 GENERAL AGRICULTURE
## 2 1101 AGRICULTURE PRODUCTION AND MANAGEMENT
## 3 1102 AGRICULTURAL ECONOMICS
## 4 1103 ANIMAL SCIENCES
## 5 1104 FOOD SCIENCE
## 6 1105 PLANT SCIENCE AND AGRONOMY
## Major_Category
## 1 Agriculture & Natural Resources
## 2 Agriculture & Natural Resources
## 3 Agriculture & Natural Resources
## 4 Agriculture & Natural Resources
## 5 Agriculture & Natural Resources
## 6 Agriculture & Natural Resources
# Keep only the majors whose name contains "DATA" or "STATISTICS".
filter(df_majors, str_detect(Major, "DATA|STATISTICS"))
## FOD1P Major
## 1 6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS
## 2 2101 COMPUTER PROGRAMMING AND DATA PROCESSING
## 3 3702 STATISTICS AND DECISION SCIENCE
## Major_Category
## 1 Business
## 2 Computers & Mathematics
## 3 Computers & Mathematics
# Input for the extraction exercise: a single string laid out like a printed
# character vector, including the [n] index markers, the embedded double
# quotes, and the line breaks between rows.
given_str <- '[1] "bell pepper" "bilberry" "blackberry" "blood orange"
[5] "blueberry" "cantaloupe" "chili pepper" "cloudberry"
[9] "elderberry" "lime" "lychee" "mulberry"
[13] "olive" "salal berry"'
# Print the string so the escaped quotes and newline characters are visible.
given_str
## [1] "[1] \"bell pepper\" \"bilberry\" \"blackberry\" \"blood orange\"\n[5] \"blueberry\" \"cantaloupe\" \"chili pepper\" \"cloudberry\" \n[9] \"elderberry\" \"lime\" \"lychee\" \"mulberry\" \n[13] \"olive\" \"salal berry\""
# Pull out each fruit name: try a two-word name first (letters, one
# whitespace character, letters) and fall back to a single run of letters.
# str_extract_all() returns a list holding one character vector per input
# string, so the result here is a list of length one.
extracted_str <- given_str %>%
  str_extract_all('[a-z]+\\s[a-z]+|[a-z]+')
extracted_str
## [[1]]
## [1] "bell pepper" "bilberry" "blackberry" "blood orange"
## [5] "blueberry" "cantaloupe" "chili pepper" "cloudberry"
## [9] "elderberry" "lime" "lychee" "mulberry"
## [13] "olive" "salal berry"
# extracted_str is a list, so it is first collapsed to a single deparsed
# string of the form c("...", "..."); every double quote is then removed.
gsub('[\"]', '', as.character(extracted_str))
## [1] "c(bell pepper, bilberry, blackberry, blood orange, blueberry, cantaloupe, chili pepper, cloudberry, elderberry, lime, lychee, mulberry, olive, salal berry)"
The two exercises below are taken from Section 14.3.5.1 of the online edition of R for Data Science:
#3 Describe, in words, what these expressions will match:
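(.)\1\1 — the same character three times in a row, e.g. "aaa".
(.)(.)\2\1 — two characters followed by the same two characters in reverse order, e.g. "abba".
(..)\1 — any two characters immediately repeated, e.g. "a1a1".
(.).\1.\1 — a character, any character, the first character again, any character, and the first character once more, e.g. "abaca".
(.)(.)(.).*\3\2\1 — three characters, then zero or more characters of any kind, then the same three characters in reverse order, e.g. "abcxyzcba".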
#4 Construct regular expressions to match words that:
Start and end with the same character.
# Sample words for exercise 4.1.
str.4.1 <- c("ZOO", "YAML", "SAS", "TOY", "MOM")
# Words that start and end with the same character.
# ^(.) captures the first character; the optional group (.*\\1)? requires the
# word to end with that same character when it has more than one character.
# Making the group optional also accepts a one-character word, which
# trivially starts and ends with the same character.
pattern.4.1 <- "^(.)(.*\\1)?$"
# Keep the sample words that match the 4.1 pattern.
str_subset(str.4.1, pattern.4.1)
## [1] "SAS" "MOM"
Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)
# Sample words for exercise 4.2.
str.4.2 <- c("church", "papa", "apple")
# Words that contain a repeated pair of letters.
# (..) captures two adjacent characters as ONE group, and \\1 demands that
# the same pair appears again later in the word. Capturing the two characters
# in separate groups and referring back to only the first would match any
# word with a single repeated letter (e.g. "level"), which is not a pair.
pattern.4.2 <- "(..).*\\1"
# Keep the sample words that match the 4.2 pattern.
str_subset(str.4.2, pattern.4.2)
## [1] "church" "papa"
Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)
# Sample words for exercise 4.3.
str.4.3 <- c("eleven", "kjkjkjkj", "apple")
# Words that contain one letter in at least three places.
# (.) captures a character and each .*\\1 finds another occurrence of it any
# distance further along. Using .* rather than a single . matters: the three
# occurrences may be adjacent or far apart (e.g. "cheese"), not only
# separated by exactly one character.
pattern.4.3 <- "(.).*\\1.*\\1"
# Keep the sample words that match the 4.3 pattern.
str_subset(str.4.3, pattern.4.3)
## [1] "eleven" "kjkjkjkj"