Part 1

College Majors

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(RCurl)
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete
# Download the FiveThirtyEight majors list as raw CSV text, then parse it.
# read.csv() already returns a data.frame, so no extra wrapper is needed.
CollegeM <- getURL("https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv")
collegeMajors <- read.csv(text = CollegeM, header = TRUE)
head(collegeMajors)
# Keep the majors whose name contains "STATISTICS" or "DATA".
# Refer to the column by its bare name so filter() evaluates it against the
# piped data rather than reaching back to the global object with `$`.
collegeMajors %>%
  filter(str_detect(Major, "STATISTICS|DATA"))

Part 2

Transformation

# Produce names used for the string-collapsing exercise.
x <- c(
  "bell pepper", "bilberry", "blackberry", "blood orange", "blueberry",
  "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime",
  "lychee", "mulberry", "olive", "salal berry"
)
x
##  [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
##  [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
## [11] "lychee"       "mulberry"     "olive"        "salal berry"
# Join every element of x into a single comma-separated string.
str_c(x, collapse = ", ")
## [1] "bell pepper, bilberry, blackberry, blood orange, blueberry, cantaloupe, chili pepper, cloudberry, elderberry, lime, lychee, mulberry, olive, salal berry"

Part 3

# Test words for the backreference patterns in this part.
x <- c(
  "apple", "banana", "kiki", "pear", "Mississippi", "azzzbra",
  "abaca", "baba", "abba", "ababa", "abbcccddddcccbba"
)

“(.)\1\1” This will look for a character that is the same three times in a row.

# Show only the elements that contain one character three times in a row.
str_view(x, pattern = "(.)\\1\\1", match = TRUE)

“(.)(.)\2\1” This matches two characters followed by the same two characters in reverse order, i.e. a four-character palindrome. For Mississippi, it finds the “issi” pattern.

# Show only the elements containing two characters followed by the same
# two characters in reverse order.
str_view(x, pattern = "(.)(.)\\2\\1", match = TRUE)

“(..)\1” This matches any two characters that are immediately repeated, such as “baba”.

# Show only the elements where a two-character group is immediately repeated.
str_view(x, pattern = "(..)\\1", match = TRUE)

“(.).\1.\1” This matches a character, then any character, then the first character again, then any character, and then the first character a third time, as in “ababa” or “abaca”.

# Show only the elements where one character occupies three alternating
# positions: the captured character, any character, the captured character
# again, any character, and the captured character a third time
# (e.g. "ababa", "abaca", or the "anana" in "banana").
str_view(x, "(.).\\1.\\1", match = TRUE)

"(.)(.)(.).*\3\2\1" This expression looks for three characters and then any in between and then the same characters in reverse order. For example, “ississi” in Mississippi.

# Show only the elements with three characters, then anything (possibly
# nothing), then those same three characters in reverse order.
str_view(x, pattern = "(.)(.)(.).*\\3\\2\\1", match = TRUE)

Part 4

# Test words for the str_subset() exercises in this part.
y <- c(
  "apple", "banana", "kiki", "pear", "Mississippi", "azzzbra", "abaca",
  "baba", "abba", "ababa", "abbcccddddcccbba", "church", "eleven"
)

Start and end with the same character: Uses the anchors of ^ at the beginning of the string and $ at the end of the string.

# Keep the words whose first and last characters are the same. The first
# alternative covers "same character at the end after anything"; the second
# (`\\1?$`) also lets a single-character string match.
str_subset(string = y, pattern = "^(.)((.*\\1$)|\\1?$)")
## [1] "azzzbra"          "abaca"            "abba"             "ababa"           
## [5] "abbcccddddcccbba"

Contain a repeated pair of letters: searches the string for a two-letter pair that occurs again later in the same string, such as the “ki” in “kiki”.

# Keep the words in which some pair of letters appears again later on.
str_subset(string = y, pattern = "([A-Za-z][A-Za-z]).*\\1")
## [1] "banana"           "kiki"             "Mississippi"      "baba"            
## [5] "ababa"            "abbcccddddcccbba" "church"

Contain one letter repeated in at least 3 places: searches for a letter that appears at least three times in the string, such as the three e's in “eleven”.

# Keep the words in which one lowercase letter occurs at least three times.
# `.*` between each backreference allows any gap (including none) between
# occurrences; a single `.` would demand exactly one character before the
# third occurrence and would miss words such as "azzzbra".
str_subset(y, "([a-z]).*\\1.*\\1")
## [1] "banana"           "Mississippi"      "azzzbra"          "abaca"           
## [5] "ababa"            "abbcccddddcccbba" "eleven"