library(tidyverse)
library(ggplot2)
library(stringr)
First, we'll read in the dataset and then keep only the majors whose names contain "DATA" or "STATISTICS":
# Download the FiveThirtyEight list of college majors.
df.college.majors <- read.csv(
  url("https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv")
)
# Keep the majors whose names contain "DATA" or "STATISTICS".
vec.majors <- with(
  df.college.majors,
  Major[grepl("DATA|STATISTICS", Major)]
)
print(vec.majors)
## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [2] "COMPUTER PROGRAMMING AND DATA PROCESSING"
## [3] "STATISTICS AND DECISION SCIENCE"
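To convert this printed output into a properly formatted character vector, we'll take the following steps: strip the bracketed index markers, split the text on the double quotes, flatten the result, and keep only the pieces that contain letters: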
# Console printout of a character vector, captured as one multi-line string.
# The bracketed numbers are the index markers print() puts at the start of
# each output row.
vec.text <- '[1] "bell pepper" "bilberry" "blackberry" "blood orange"
[5] "blueberry" "cantaloupe" "chili pepper" "cloudberry"
[9] "elderberry" "lime" "lychee" "mulberry"
[13] "olive" "salal berry"'

# Remove the index markers, cut the text at every double quote, and flatten
# the resulting list into a plain character vector.
vec.text.char <- unlist(
  strsplit(gsub("(\\n\\[\\d+\\])|(^\\[\\d+\\])", "", vec.text), '\\"')
)

# The split leaves whitespace-only fragments between the names; keep just the
# pieces that contain a lowercase letter.
vec.text.char <- grep("[a-z]", vec.text.char, value = TRUE)
print(vec.text.char)
## [1] "bell pepper" "bilberry" "blackberry" "blood orange" "blueberry"
## [6] "cantaloupe" "chili pepper" "cloudberry" "elderberry" "lime"
## [11] "lychee" "mulberry" "olive" "salal berry"
1. Describe, in words, what each of the following expressions will match:
(.)\1\1
"(.)(.)\\2\\1"
(..)\1
"(.).\\1.\\1"
"(.)(.)(.).*\\3\\2\\1"
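(.)\1\1: without the extra escape character \, the pattern will flag any character followed by two \1s (the control character that R produces for the string escape \1). If (.)\\1\\1 is used instead, it matches one character that appears three times in a row. Example seen below: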
# "\1" inside an R string literal is the escape for the character with code 1,
# so this pattern means "any character followed by two of that control
# character" rather than a backreference.
str_detect(string = "ZZZ", pattern = "(.)\1\1")
## [1] FALSE
str_detect(string = "Z\1\1", pattern = "(.)\1\1")
## [1] TRUE
VS
# "\\1" reaches the regex engine as \1, a backreference to group 1, so the
# pattern matches one character repeated three times in a row.
str_detect(string = "ZZZ", pattern = "(.)\\1\\1")
## [1] TRUE
str_detect(string = "Z\1\1", pattern = "(.)\\1\\1")
## [1] FALSE
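(.)(.)\\2\\1 flags two characters followed by the same two characters in reverse order (e.g. zooz). For (..)\1, without the extra escape character \, the pattern will flag two characters followed by \1. But if the escape character is included, then it will flag a pair of characters (which can be different) repeated consecutively (e.g. dodo). Example seen below: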
# With a single backslash, "\1" is the character with code 1, so the pattern
# is "any two characters followed by that control character".
str_detect(string = "zz\1", pattern = "(..)\1")
## [1] TRUE
str_detect(string = "dodo", pattern = "(..)\1")
## [1] FALSE
VS
# With the doubled backslash the regex engine sees \1, a backreference, so the
# pattern matches any two characters immediately followed by the same two.
str_detect(string = "zz\1", pattern = "(..)\\1")
## [1] FALSE
str_detect(string = "dodo", pattern = "(..)\\1")
## [1] TRUE
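Construct regular expressions to match words that (a) start and end with the same character, (b) contain a repeated pair of letters, and (c) contain one letter repeated in at least three places: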
# `match = TRUE` shows only the inputs that match the pattern. TRUE is spelled
# out because `T` is an ordinary variable that can be reassigned.

# Starts and ends with the same character.
str_view("civic", "^(.).*\\1$", match = TRUE)
# Contains a pair of characters that occurs again later in the word.
str_view("church", "(..).*\\1", match = TRUE)
# Contains the same character in at least three places.
str_view("eleven", "(.).*\\1.*\\1", match = TRUE)