library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
  1. Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset, provide code that identifies the majors that contain either “DATA” or “STATISTICS”
# Download the FiveThirtyEight majors list; the first CSV row holds the column names.
url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
major_list <- fread(url, header = TRUE)

# View every major whose name contains "DATA" or "STATISTICS", match highlighted.
data_statistic_major <- major_list[["Major"]] %>%
  str_view("DATA|STATISTICS")
data_statistic_major
## [44] │ MANAGEMENT INFORMATION SYSTEMS AND <STATISTICS>
## [52] │ COMPUTER PROGRAMMING AND <DATA> PROCESSING
## [59] │ <STATISTICS> AND DECISION SCIENCE
  1. Write code that transforms the data below
# Raw console printout of a character vector: index markers such as [1] plus
# the double-quoted items, spread over several lines.
fg <- '[1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"
[5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  
[9] "elderberry"   "lime"         "lychee"       "mulberry"    
[13] "olive"        "salal berry"'

# Every item is wrapped in double quotes, so capture whatever sits between a
# pair of quotes. Unlike a letters-plus-one-optional-space pattern, this also
# keeps one-letter names, names with three or more words, and names containing
# hyphens, digits or accented letters.
pattern <- '"[^"]+"'
fruit_and_vegetable <- unlist(str_extract_all(fg, pattern))
# Strip the surrounding quotes to leave the bare names.
fruits <- str_remove_all(fruit_and_vegetable, "\"")
fruits
##  [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
##  [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
## [11] "lychee"       "mulberry"     "olive"        "salal berry"
  1. Describe, in words, what these expressions will match:
  1. (.)\1\1: This regular expression matches any string in which the same character appears three times in a row, e.g.
# Show where the pattern matches; strings without a match are not listed.
c("caaarve", "trinnne", "craske") %>%
  str_view("(.)\\1\\1")
## [1] │ c<aaa>rve
## [2] │ tri<nnn>e
# Same pattern as a logical test, one result per input string.
c("caaarve", "trinnne", "craske") %>%
  str_detect("(.)\\1\\1")
## [1]  TRUE  TRUE FALSE
  1. “(.)(.)\2\1”: This regular expression matches any string containing four consecutive characters in which the third character equals the second and the fourth equals the first, i.e. a pair of characters followed by the same pair reversed (such as “abba”). e.g.
# Show where the pattern matches; strings without a match are not listed.
c("caa", "raaretar", "trrt", "abba", "bassat") %>%
  str_view("(.)(.)\\2\\1")
## [2] │ <raar>etar
## [3] │ <trrt>
## [4] │ <abba>
## [5] │ b<assa>t
# Same pattern as a logical test, one result per input string.
c("caa", "raaretar", "trrt", "abba", "bassat") %>%
  str_detect("(.)(.)\\2\\1")
## [1] FALSE  TRUE  TRUE  TRUE  TRUE
  1. (..)\1: This regular expression matches any string containing a pair of characters immediately followed by the same pair again, i.e. four consecutive characters where the third and fourth repeat the first and second (such as “arar”). e.g.
# Show where the pattern matches; strings without a match are not listed.
c("caac", "raarar", "trtr", "abba", "sat") %>%
  str_view("(..)\\1")
## [2] │ ra<arar>
## [3] │ <trtr>
# Same pattern as a logical test, one result per input string.
c("caac", "raarar", "trtr", "abba", "sat") %>%
  str_detect("(..)\\1")
## [1] FALSE  TRUE  TRUE FALSE FALSE
  1. “(.).\1.\1”: This regular expression matches any string containing five consecutive characters in which the first, third and fifth characters are the same; the second and fourth characters can be anything. e.g.
# Show where the pattern matches; strings without a match are not listed.
c("cacrcab", "rbti", "grgrg", "btvqvrv") %>%
  str_view("(.).\\1.\\1")
## [1] │ <cacrc>ab
## [3] │ <grgrg>
## [4] │ bt<vqvrv>
# Same pattern as a logical test, one result per input string.
c("cacrcab", "rbti", "grgrg", "btvqvrv") %>%
  str_detect("(.).\\1.\\1")
## [1]  TRUE FALSE  TRUE  TRUE
  1. **“(.)(.)(.).*\3\2\1”**: This regular expression matches any string containing three characters that are followed, after zero or more other characters, by the same three characters in reverse order (so the match is at least six characters long). e.g.
# Show where the pattern matches; strings without a match are not listed.
c("abccba", "gretcdterab", "gvbe", "qrstuv") %>%
  str_view("(.)(.)(.).*\\3\\2\\1")
## [1] │ <abccba>
## [2] │ g<retcdter>ab
# Same pattern as a logical test, one result per input string.
c("abccba", "gretcdterab", "gvbe", "qrstuv") %>%
  str_detect("(.)(.)(.).*\\3\\2\\1")
## [1]  TRUE  TRUE FALSE FALSE
  1. Construct regular expressions to match words that:
  1. Start and end with the same character: **^([a-zA-Z]).*\1$** e.g.
# The optional group (.*\\1)? lets a one-letter word such as "a" count as
# starting and ending with the same character; without it the pattern needs at
# least two characters and would reject such words.
str_view(c("apple", "anana", "abba", "creerp"), "^([a-zA-Z])(.*\\1)?$")
## [2] │ <anana>
## [3] │ <abba>
  1. Contain a repeated pair of letters: **([a-zA-Z][a-zA-Z]).*\1** e.g.
# Show the fruits that contain some two-letter pair twice, match highlighted.
fruits %>%
  str_view("([a-zA-Z][a-zA-Z]).*\\1")
##  [1] │ bell <peppe>r
##  [7] │ chili <peppe>r
##  [9] │ eld<erber>ry
## [14] │ s<alal> berry
  1. Contain one letter repeated in at least three places: **([a-zA-Z]).*\1.*\1** e.g.
# No trailing "." after the last backreference: the third occurrence may be the
# final character of the word (e.g. "banana"), so nothing has to follow it.
str_view(fruits, "([a-zA-Z]).*\\1.*\\1")
## [1] │ b<ell peppe>r
## [4] │ bl<ood o>range
## [7] │ chili <pepp>er
## [9] │ <elderbe>rry