Load required libraries.

tidyverse

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

1.

Read the CSV file from FiveThirtyEight’s GitHub repo and keep only the column that contains the names of the majors.

# Raw CSV of college majors in FiveThirtyEight's data repository.
filename <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"

# read.csv() already assumes a header row and comma separators, so the
# defaults are used; only the second column (the major names) is kept, still
# as a one-column data frame.
majors <- subset(read.csv(filename), select = 2)

Use gsub() with regex to clean the data.

# Clean the major names one element at a time.
# `majors` is a one-column data frame, so the column is pulled out as a
# character vector first. Handing the data frame itself to gsub() coerces it
# to a single deparsed "c(...)" string; splitting that string on ", " breaks
# majors whose names contain commas and leaves "c(" / ")" attached to the
# first and last entries.
y <- as.character(majors[[1]])
y <- gsub(pattern = "\"", replacement = "", y, fixed = TRUE)
y <- gsub(pattern = "\\n", replacement = "", y)

# Keep the shape the following step relies on: a list holding one character
# vector with one element per major.
z <- list(y)

Loop through the majors, looking for the ones containing the keywords `DATA` or `STATISTICS`.

# One-column data frame with one row per major.
a <- data.frame(as.list(z))

# Print every major that mentions DATA or STATISTICS.
# A single alternation pattern means a major containing both keywords is
# printed once rather than twice, and seq_len() keeps the loop from running
# over c(1, 0) when the data frame has no rows.
for (i in seq_len(nrow(a))) {
  m <- a[i, 1]
  if (grepl("DATA|STATISTICS", m)) {
    print(m)
  }
}

## [1] "MANAGEMENT INFORMATION SYSTEMS AND STATISTICS"
## [1] "COMPUTER PROGRAMMING AND DATA PROCESSING"
## [1] "STATISTICS AND DECISION SCIENCE"

2.

Input string to be transformed:

# Console-style printout of a character vector, stored as one string.
# The four printed lines are joined with newlines; the trailing spaces on the
# second and third lines are part of the original text.
a <- paste(
  '[1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"',
  '[5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  ',
  '[9] "elderberry"   "lime"         "lychee"       "mulberry"    ',
  '[13] "olive"        "salal berry"',
  sep = "\n"
)

Regex for cleaning the string

# Work on a copy of the printed-vector text.
x <- a

# Remove the "[n]" index markers as whole units. Stripping every digit and
# bracket individually would also damage items that contain them.
b <- gsub("\\[[0-9]+\\]", "", x)

# Collapse each run of whitespace (line breaks included) to one space and
# trim the ends, so no leading space has to be patched up afterwards.
b <- trimws(gsub("\\s+", " ", b))

# A closing quote followed by a space marks the end of an item: add the comma.
b <- gsub("\" ", "\", ", b, fixed = TRUE)

# Wrap the comma-separated items in c(...) to form an R vector literal.
# NOTE: the variable name `c` shadows base::c; it is kept because the
# writeLines(c) call that follows reads it.
c <- paste0("c(", b, ")")

Finally, use writeLines() to see the raw transformed string.

# writeLines() writes the string's characters as they are, so the embedded
# double quotes appear without the backslash escapes that print() would add.
writeLines(c)

## c("bell pepper", "bilberry", "blackberry", "blood orange", "blueberry", "cantaloupe", "chili pepper", "cloudberry", "elderberry", "lime", "lychee", "mulberry", "olive", "salal berry")

3.

Describe what the expressions will match.

(.)\1\1

# 1: sample strings; "a\1\1" holds two literal \001 (SOH) characters.
z <- c(
  "aabbc", "aba", "a\1\1", "bcc", "xcccx", "church", "bcb",
  "ffffffxyzhhhhhhhf", "k", "banana", "eleven"
)
# Show only the matching elements. In this pattern "\1" is an octal escape
# inside the R string, not the backreference "\\1".
str_view(string = z, pattern = "(.)\1\1", match = TRUE)

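This matches any character followed by two Start of Heading (SOH, `\001`) characters: inside an ordinary R string, `\1` is an octal escape rather than a backreference. Written with escaped backslashes, `(.)\\1\\1` would instead match the same character three times in a row (e.g. "ccc" in "xcccx").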

"(.)(.)\\2\\1"

# 2: the double quotes inside the pattern are literal characters, so only an
# element that itself contains double quotes ("\"woow\"") can match.
z <- c(
  "aabbc", "aba", "a\1\1", "bcc", "xcccx", "church", "bcb", "fxyzhhhhhhhf",
  "k", "banana", "eleven", "naabbm", "abba", "\"woow\""
)
str_view(string = z, pattern = '"(.)(.)\\2\\1"', match = TRUE)

This matches any string that contains a four-character palindrome enclosed in double quotes: two characters followed by the same two characters in reverse order, with a literal " immediately before and after (e.g. "woow").

(..)\1

# 3: "\1" is the SOH character here as well, so several samples end in it.
z <- c(
  "aabbc", "aba", "a\1\1", "bcc", "xcccx", "church", "bcb", "aabjjjj",
  "hijklmnmn", "ffffffxyzhhhhhhhf", "k", "banana", "eleven", "ab\1", "bb\1",
  "cc", "gfrt\1", "abba"
)
str_view(string = z, pattern = "(..)\1", match = TRUE)

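This will match any string that contains two characters followed by an SOH character (`\001`), because `\1` inside an ordinary R string is an octal escape rather than a backreference. The backreference form `(..)\\1` would instead match a pair of characters that is immediately repeated (e.g. "mnmn" in "hijklmnmn").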

"(.).\\1.\\1"

# 4: the pattern includes literal double quotes, so "\"abaca\"" is the sample
# meant to match while the unquoted "abaca" is not.
z <- c(
  "aabbc", "aba", "a\1\1", "bcc", "xcccx", "church", "bcb", "aabjjjj",
  "hijklmnmn", "ffffffxyzhhhhhhhf", "k", "banana", "eleven", "ab\1", "bb\1",
  "cc", "gfrt\1", "abba", "abaca", "\"abaca\""
)
str_view(string = z, pattern = '"(.).\\1.\\1"', match = TRUE)

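This will match a five-character substring enclosed in double quotes in which the first, third and fifth characters are the same (e.g. "abaca"): one character appearing three times, with a single arbitrary character between each occurrence.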

"(.)(.)(.).*\\3\\2\\1" This will match a string that contains a substring enclosed within double quotes, that starts with 3 characters, then has any number of random other characters, then has the same first 3 characters mirrored. In other words, the substring is like a palindrome, except the reflection happens after some random characters in between.

4.

Construct regular expressions to match words that:

Start and end with the same character.

# Words that start and end with the same character.
x <- c("abc", "aba", "bcc", "bcb", "ffffffxyzhhhhhhhf", "k")
# The "(.*\\1)?" tail is optional so that a one-character word such as "k",
# whose first and last character are the same character, also matches.
str_view(x, "^(.)(.*\\1)?$", match = TRUE)

Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)

# Words that contain a repeated pair of letters.
y <- c("aabbc", "aba", "bcc", "church", "bcb", "ffffffxyzhhhhhhhf", "k")
# The backreference \\1 requires the SAME two characters to appear again
# later in the word. Two independent "(..)" groups would accept any string of
# four or more characters (e.g. "aabbc"), repeated pair or not.
str_view(y, "(..).*\\1", match = TRUE)

Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

# Words in which one letter occurs in at least three places.
z <- c(
  "aabbc", "aba", "bcc", "church", "bcb", "ffffffxyzhhhhhhhf", "k",
  "banana", "eleven"
)
# Capture a character, then require it twice more with anything in between.
str_view(string = z, pattern = ".*(.).*\\1.*\\1.*", match = TRUE)

A3

Tora Mullings

2/18/2022

Load required libraries.

tidyverse

1.

Read CSV file from Fivethirtyeight’s Github repo. Take a subset including the names of the majors.

Use gsub() with regex to clean the data.

Loop through the majors, looking for the ones containing the keywords `DATA` or `STATISTICS`.

2.

Input string to be transformed:

Regex for cleaning the string

Finally, use writeLines() to see the raw transformed string.

3.

Describe what the expressions will match.

4.

Construct regular expressions to match words that:

A3

Tora Mullings

2/18/2022

Load required libraries.

tidyverse

1.

Read CSV file from Fivethirtyeight’s Github repo. Take a subset including the names of the majors.

Use gsub() with regex to clean the data.

Loop through the majors, looking for the ones containing the keywords DATA or STATISTICS.

2.

Input string to be transformed:

Regex for cleaning the string

Finally, use writeLines() to see the raw transformed string.

3.

Describe what the expressions will match.

4.

Construct regular expressions to match words that:

Loop through the majors, looking for the ones containing the keywords `DATA` or `STATISTICS`.