Required libraries

library(tidyverse)
library(rvest)

Question 1

Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset, provide code that identifies the majors that contain either “DATA” or “STATISTICS”

# Raw CSV of the FiveThirtyEight college-majors list.
url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"

# Download the file and parse it into a tibble.
df <- url %>%
  read_csv()

Method 1

# Terms to look for in the major names; matching is case-insensitive.
word_search <- c("statistics", "data")

# Run one vectorised search per term instead of growing a list inside a loop.
# Each element of `matches_by_word` holds the majors containing that term.
matches_by_word <- lapply(
  word_search,
  function(word) grep(word, df$Major, ignore.case = TRUE, value = TRUE)
)

# Flatten in `word_search` order and drop repeats, so a major that contains
# more than one of the terms is reported only once. The result is kept as a
# list with one major per element.
majors_data_stats <- as.list(unique(unlist(matches_by_word)))

print(majors_data_stats)
## [[1]]
## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## 
## [[2]]
## [1] "STATISTICS AND DECISION SCIENCE"
## 
## [[3]]
## [1] "COMPUTER PROGRAMMING AND DATA PROCESSING"

Method 2

# Keep the rows whose Major contains "statistics" or "data" (any letter case),
# then render the filtered rows as a table.
df %>%
  filter(
    str_detect(Major, regex("statistics|data", ignore_case = TRUE))
  ) %>%
  knitr::kable()
FOD1P Major Major_Category
6212 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS Business
2101 COMPUTER PROGRAMMING AND DATA PROCESSING Computers & Mathematics
3702 STATISTICS AND DECISION SCIENCE Computers & Mathematics

Question 2

Write code that transforms the data below:

[1] “bell pepper” “bilberry” “blackberry” “blood orange”

[5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”

[9] “elderberry” “lime” “lychee” “mulberry”

[13] “olive” “salal berry”

Into a format like this:

c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)

# Console-style printout of a character vector, stored as a single multi-line
# string: bracketed index markers followed by double-quoted items.
start_str <- '[1] "bell pepper" "bilberry"   "blackberry"  "blood orange"
[5] "blueberry"  "cantaloupe"  "chili pepper" "cloudberry"
[9] "elderberry"  "lime"     "lychee"    "mulberry"
[13] "olive"    "salal berry"'

# Extract every double-quoted item (quotes included) in order of appearance.
# Matching the quoted tokens directly skips the index markers, the padding
# and the newlines, so no separate clean-up passes are required.
quoted_items <- regmatches(start_str, gregexpr('"[^"]*"', start_str))[[1]]

# Join the items with ", " and wrap the joined text in c( ... ) exactly once,
# so each item appears a single time in the result.
update_string <- paste0("c(", paste(quoted_items, collapse = ", "), ")")

writeLines(update_string)
## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")

Question 3

Describe, in words, what these expressions will match:

(.)\1\1 Same character repeated three times

“(.)(.)\2\1” Two characters followed by the same two characters in reverse order (e.g. “abba”)

(..)\1 Any pair of characters immediately repeated (e.g. “abab”)

“(.).\1.\1” A character, then any character, then the first character again, then any character, then the first character a third time (e.g. “abaca”)

“(.)(.)(.).*\3\2\1” Three characters, followed by zero or more of any characters, followed by the same three characters in reverse order (e.g. “abcxyzcba”).


Question 4

Construct regular expressions to match words that:

Start and end with the same character. “^(.)((.*\1$)|\1?$)”

Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.) “([A-Za-z][A-Za-z]).*\1”

Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.) “([A-Za-z]).*\1.*\1”