# Load needed libraries
library(tidyverse)
library(RCurl)
library(knitr)
library(stringr)

1. Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/], provide code that identifies the majors that contain either “DATA” or “STATISTICS”

# Download the majors list from the FiveThirtyEight GitHub repository.
# na.strings = "" makes read.csv() read empty fields as NA (not as 0).
filename <- getURL("https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv")
majors_list <- read.csv(text = filename, na.strings = "")
head(majors_list, 10)

# The task asks for majors containing either "DATA" or "STATISTICS", so both
# terms are searched with a single alternation pattern rather than two calls.
# value = TRUE returns the matching major names instead of their positions.
grep(
  pattern = "DATA|STATISTICS",
  x = majors_list$Major,
  value = TRUE,
  ignore.case = TRUE
)
## Returns, in one vector (in file row order), the three majors that the
## separate "Data" and "statistics" searches found:
##   "COMPUTER PROGRAMMING AND DATA PROCESSING"
##   "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
##   "STATISTICS AND DECISION SCIENCE"

2. Write code that transforms the data below:

# Download the produce list (one item per line). quote = "" turns off quote
# handling so the text is read verbatim.
filename <- getURL("https://raw.githubusercontent.com/audiorunner13/Masters-Coursework/main/DATA607%20Spring%202021/Week3/Data/veggies.txt")
berry_veg <- read.delim(text = filename, header = FALSE, quote = "")
berry_veg$V1
##  [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
##  [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
## [11] "lychee"       "mulberry"     "olive"        "salal berry"

# Build the c("...", "...") text from the character column itself: wrap each
# item in double quotes, join the items with ", ", then add the c( ) wrapper.
# Passing the whole data frame to str_c() only produced this text through an
# implicit coercion of a non-atomic argument, which raised a warning.
berry_veg_vec <- str_c(
  "c(",
  str_c('"', berry_veg$V1, '"', collapse = ", "),
  ")"
)
writeLines(berry_veg_vec)
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")

3. Describe, in words, what these expressions will match

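As a regular expression, (.)\1\1 matches any single character repeated three times in a row (e.g. "aaa"). The call below is commented out because the pattern is not enclosed in quotes, which is an R syntax error; written as an R string the pattern would be "(.)\\1\\1".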

# str_view(words, (.)\1\1, match = TRUE)

Find all words that contain two characters immediately followed by the same two characters in reverse order, i.e. a four-character palindrome such as "abba"

# (.)(.) captures two characters; \\2\\1 then requires them again, reversed.
str_view(string = words, pattern = "(.)(.)\\2\\1", match = TRUE)

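As a regular expression, (..)\1 matches any two characters immediately followed by the same two characters (e.g. "abab"). The call below is commented out because the pattern is not enclosed in quotes, which is an R syntax error; written as an R string the pattern would be "(..)\\1".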

#str_view(words,(..)\1, match = TRUE)

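Find all words in which the same character appears three times with exactly one arbitrary character between each occurrence, i.e. at alternating positions such as "abaca"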

# (.) captures one character; each ".\\1" is any character followed by that
# same captured character, so it must occur at three alternating positions.
str_view(string = words, pattern = "(.).\\1.\\1", match = TRUE)

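Find all words that contain three characters, followed by zero or more arbitrary characters, followed by the same three characters in reverse order (e.g. "par" ... "rap" in "paragraph")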

# Three captured characters, any run of characters (.*), then the same three
# characters in reverse order (\\3\\2\\1).
str_view(string = words, pattern = "(.)(.)(.).*\\3\\2\\1", match = TRUE)

4. Construct regular expressions to match words that:

Start and end with the same character

# ^(.) captures the first character and \\1 just before $ requires the last
# character to be the same one. The group (.*\\1) is optional so that
# one-character words also count. The earlier pattern "^a.*a$" only found
# words that start and end with the letter "a".
words[str_detect(words, "^(.)(.*\\1)?$")]
## The result still contains "america" and "area", and now also includes words
## that begin and end with any other matching character.
str_view(words, "^(.)(.*\\1)?$", match = TRUE)

Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)

# (..) captures a pair of characters; .* allows any number of characters
# (including none) before the pair appears again (\\1). The earlier pattern
# "(..)(..)\\1" required exactly two characters between the pairs, so it
# matched "church" but missed the adjacent "emem" in "remember".
x <- c("church", "remember")
str_view(x, "(..).*\\1", match = TRUE)

Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

# (.) captures one character and each ".*\\1" requires it to occur again
# anywhere later, giving at least three occurrences in total. The earlier
# pattern "(.)(.)(.).+\\1" only required two occurrences of the character,
# separated by three or more other characters.
# Of the test words, "twelve" has only two "e"s and must not match.
test_word <- c("eleven", "twelve", "twentieth", "ninconponop")
str_view(test_word, "(.).*\\1.*\\1", match = TRUE)