Assignment 2 – Character Manipulation & Data Processing

library(tidyverse)

1) "DATA" or "STATISTICS" College Majors

# Read the list of college majors from the CSV file
college_majors <- read.csv(file = 'majors-list.csv')

# Quick structural overview of the loaded table
glimpse(college_majors)

# Keep only the rows whose Major contains "DATA" or "STATISTICS"
data_or_stat_majors <- college_majors %>%
  filter(grepl(pattern = 'DATA|STATISTICS', x = Major))
data_or_stat_majors

2) Transforming data

# Read the text file and join its lines into one space-separated string
fruit_str <- paste(readLines("fruits.txt"), collapse = " ")
fruit_str

# Extract every double-quoted substring.
# `[^"]*` cannot run past a closing quote, and no start-of-string lookahead
# is used, so a quoted item at the very beginning of the text is paired
# with its own closing quote rather than being skipped.
matches <- str_extract_all(fruit_str, '"[^"]*"')

# str_extract_all() returns a list with one element per input string;
# `[[1]]` is already the character vector of matches for our single string.
fruit_vector_raw <- matches[[1]]

# Strip the surrounding double quotes from each extracted item
fruit_vector <- str_replace_all(fruit_vector_raw, '"', "")
fruit_vector

# Check that result is a character vector
typeof(fruit_vector)

3) Describe regular expressions

"(.)\1\1" - Matches any single character repeated three times in a row (e.g. "aaa").

# Demo: "aaa" is the only run of three identical characters in the sample
str_view(
  string = "jlfjsdaaabdfdsj",
  pattern = '(.)\\1\\1',
  match = TRUE
)

"(.)(.)\2\1" - Matches a pair of characters immediately followed by the same pair in reverse order (e.g. "mllm").

# Demo: "mllm" is a two-character pair followed by its mirror image
str_view(
  string = "jkdfajmllmjlksdfs",
  pattern = '(.)(.)\\2\\1',
  match = TRUE
)
# Same demo with the double quotes treated as literal parts of the pattern
str_view(
  string = 'jkdfaj"mllm"jlksdfs',
  pattern = '"(.)(.)\\2\\1"',
  match = TRUE
)

"(..)\1" - Matches any two characters immediately followed by the same two characters (e.g. "mlml").

# Demo: "mlml" is the two-character group "ml" repeated back to back
str_view(
  string = "jkdfajmlmljlksdfs",
  pattern = '(..)\\1',
  match = TRUE
)

"(.).\1.\1" - Matches a character that appears three times with exactly one (arbitrary) character between each occurrence (e.g. "lelal": "l" appears three times, separated by "e" and "a").

# Demo: in "lelal" the letter "l" occurs at every other position, three times
str_view(
  string = "fdsfslelallkjd",
  pattern = '(.).\\1.\\1',
  match = TRUE
)
# Same demo with the double quotes treated as literal parts of the pattern
str_view(
  string = 'fdsfs"lelal"lkjd',
  pattern = '"(.).\\1.\\1"',
  match = TRUE
)

"(.)(.)(.).*\3\2\1" - Matches three characters, followed by any number of characters, followed by the same three characters in reverse order (e.g. "abc" ... "cba"). The match spans from the first of the three characters through the end of the reversed copy.

# Demo: the match runs from "abc" through the later, reversed "cba"
str_view(
  string = "jdkafdabcjkfajdsfcbajkdfsd",
  pattern = '(.)(.)(.).*\\3\\2\\1',
  match = TRUE
)
# Same demo with the double quotes treated as literal parts of the pattern
str_view(
  string = 'jdkafd"abcjkfajdsfcba"jkdfsd',
  pattern = '"(.)(.)(.).*\\3\\2\\1"',
  match = TRUE
)

4) Construct regular expressions

# Start and end with the same character.
sample_txt <- "This will be my test text. Abba, so that I can see if cac will 5five5?"

# Split on spaces, lower-case every word, then strip , . ? punctuation
sample_words <- strsplit(sample_txt, " ") %>%
  unlist() %>%
  tolower() %>%
  str_replace_all(',|\\.|\\?', "")
sample_words

# `(.*\\1)?` makes everything after the first character optional, so a
# one-character word such as "i" (which trivially starts and ends with the
# same character) is matched in addition to longer words like "abba".
str_view(sample_words, '^(.)(.*\\1)?$', match = TRUE)

# Contain a repeated pair of letters (e.g. "church" contains "ch" repeated twice.)
sample_txt_two <- "This will be my rethath text. Abba, so that I can see if church will work?"

# Split on spaces, lower-case every word, then strip , . ? punctuation
sample_words_two <- strsplit(sample_txt_two, " ") %>%
  unlist() %>%
  tolower() %>%
  str_replace_all(',|\\.|\\?', "")
sample_words_two

# Capture a two-letter group and require the same pair again later in the word
str_view(
  string = sample_words_two,
  pattern = '\\b\\w*([a-z]{2}).*?(\\1)\\w*\\b',
  match = TRUE
)

# Contain one letter repeated in at least three places (e.g. "eleven" contains three "e"s.)
sample_txt_three <- "This will be my eleven text. Abba, so that I can see if banana will work?"

# Split on spaces, lower-case every word, then strip , . ? punctuation
sample_words_three <- strsplit(sample_txt_three, " ") %>%
  unlist() %>%
  tolower() %>%
  str_replace_all(',|\\.|\\?', "")
sample_words_three

# Capture one letter and require it to occur two more times later in the word
str_view(
  string = sample_words_three,
  pattern = '\\b\\w*([a-z]).*\\1.*\\1\\w*\\b',
  match = TRUE
)

This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.

Assignment 2 – Character Manipulation & Data Processing

Matthew Lucich

1) "DATA" or "STATISTICS" College Majors

2) Transforming data

3) Describe regular expressions

4) Construct regular expressions