Data607_Assignment3

Data607: Assignment 3 String Manipulation & RegEx

Load R Libraries

#knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(lubridate)

## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Pull the appropriate majors from the FiveThirtyEight list

#1. Using the 173 majors listed in fivethirtyeight.com’s College Majors dataset [https://fivethirtyeight.com/features/the-economic-guide-to-picking-a-college-major/], provide code that identifies the majors that contain either “DATA” or “STATISTICS”

# Location of FiveThirtyEight's majors list (downloaded each time this runs).
raw_file <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/majors-list.csv"
majors <- as_tibble(read.csv(raw_file))

# Keep the majors whose name contains DATA or STATISTICS.
# str_extract() returns a plain character vector (NA when there is no match),
# so matched_name is an ordinary column; str_match() returns a matrix, which
# would be stored as a matrix column and printed as `matched_name[,1]`.
majors_sub <- majors %>%
  mutate(matched_name = str_extract(Major, ".*DATA.*|.*STATISTICS.*")) %>%
  select(Major, matched_name) %>%
  filter(!is.na(matched_name))
majors_sub

## # A tibble: 3 × 2
##   Major                                         matched_name                    
##   <chr>                                         <chr>                           
## 1 MANAGEMENT INFORMATION SYSTEMS AND STATISTICS MANAGEMENT INFORMATION SYSTEMS …
## 2 COMPUTER PROGRAMMING AND DATA PROCESSING      COMPUTER PROGRAMMING AND DATA P…
## 3 STATISTICS AND DECISION SCIENCE               STATISTICS AND DECISION SCIENCE

Transform string data

#2 Write code that transforms the data below:

[1] “bell pepper” “bilberry” “blackberry” “blood orange”
[5] “blueberry” “cantaloupe” “chili pepper” “cloudberry”
[9] “elderberry” “lime” “lychee” “mulberry”
[13] “olive” “salal berry”

Into a format like this:

c(“bell pepper”, “bilberry”, “blackberry”, “blood orange”, “blueberry”, “cantaloupe”, “chili pepper”, “cloudberry”, “elderberry”, “lime”, “lychee”, “mulberry”, “olive”, “salal berry”)

# 2. Input data: the console print-out of the fruit vector, stored one printed
#    line per string (index markers and padding included, exactly as printed).
str1 <- '[1] "bell pepper"  "bilberry"     "blackberry"   "blood orange"'
str2 <- '[5] "blueberry"    "cantaloupe"   "chili pepper" "cloudberry"  '
str3 <- '[9] "elderberry"   "lime"         "lychee"       "mulberry"    '
str4 <- '[13] "olive"        "salal berry"'
#Into a format like this:  character vector
# Turn printed character-vector output back into a real character vector.
#
# str_ex: character vector; each element is one line of console output such as
#         '[1] "bell pepper"  "bilberry"'.
# Returns: a character vector holding the text found between each pair of
#          double quotes, in order of appearance across all elements
#          (character(0) when nothing is quoted).
#
# Capturing the quoted text directly, rather than deleting every character that
# is not a lowercase letter, keeps capitals, digits and punctuation inside the
# items intact, and the "[n]" index markers are ignored because they are never
# inside quotes.
clean_str_vec <- function(str_ex) {
  # One matrix per input element: column 1 is the full match including the
  # quotes, column 2 is the captured text between them.
  quoted <- str_match_all(str_ex, '"([^"]*)"')
  items <- lapply(quoted, function(m) m[, 2])
  # as.character() turns the NULL that unlist() gives for empty input into
  # character(0), so the return type is always character.
  as.character(unlist(items, use.names = FALSE))
}
# Parse the four printed lines in a single vectorised call, then sort the
# recovered items.
new_format_vec <- c(str1, str2, str3, str4) %>%
  clean_str_vec() %>%
  str_sort()

new_format_vec

##  [1] "bell pepper"  "bilberry"     "blackberry"   "blood orange" "blueberry"   
##  [6] "cantaloupe"   "chili pepper" "cloudberry"   "elderberry"   "lime"        
## [11] "lychee"       "mulberry"     "olive"        "salal berry"

I added a more detailed description of each step performed by the nested function calls that clean and prepare the strings for this transformation. It will also be a helpful reference for reorienting myself with the flow of each part of the syntax after some time away from the code.

Explain the Regex patterns in plain english

#3 Describe, in words, what these expressions will match:

(.)\1\1 This is written as a bare regular expression rather than as an R string (as an R string it would be “(.)\\1\\1”). It matches any single character that appears three times in a row, e.g. “aaa”.
“(.)(.)\\2\\1” This will match two characters followed immediately by the same two characters in reverse order, e.g. “abba”.
(..)\1 This is also a bare regular expression rather than an R string (as an R string it would be “(..)\\1”). It matches any two characters that are immediately repeated in the same order, e.g. “yxyx”.
“(.).\\1.\\1” This will match five characters in which the first character is repeated in the 3rd and 5th positions; the 2nd and 4th positions can hold any character except a newline, e.g. “abaza”.
“(.)(.)(.).*\\3\\2\\1” This will match three characters, followed by zero or more characters of any kind, followed by the same three characters in reverse order, e.g. “abccba”.

# Sample strings used to exercise the five patterns from question 3.
test_vec <- c(
  "aaa", "bbbght", "abba", "1221", "yxyx", "zrzrjlk", "13171", "abaza",
  "123djakfl;jdafkjad321", "abccba"
)
# 1: one character followed by two more copies of itself
test_vec %>% str_match("(.)\\1\\1")

##       [,1]  [,2]
##  [1,] "aaa" "a" 
##  [2,] "bbb" "b" 
##  [3,] NA    NA  
##  [4,] NA    NA  
##  [5,] NA    NA  
##  [6,] NA    NA  
##  [7,] NA    NA  
##  [8,] NA    NA  
##  [9,] NA    NA  
## [10,] NA    NA

# 2: two characters followed by the same two characters in reverse order
test_vec %>% str_match("(.)(.)\\2\\1")

##       [,1]   [,2] [,3]
##  [1,] NA     NA   NA  
##  [2,] NA     NA   NA  
##  [3,] "abba" "a"  "b" 
##  [4,] "1221" "1"  "2" 
##  [5,] NA     NA   NA  
##  [6,] NA     NA   NA  
##  [7,] NA     NA   NA  
##  [8,] NA     NA   NA  
##  [9,] NA     NA   NA  
## [10,] "bccb" "b"  "c"

# 3: a pair of characters immediately followed by the same pair
test_vec %>% str_match("(..)\\1")

##       [,1]   [,2]
##  [1,] NA     NA  
##  [2,] NA     NA  
##  [3,] NA     NA  
##  [4,] NA     NA  
##  [5,] "yxyx" "yx"
##  [6,] "zrzr" "zr"
##  [7,] NA     NA  
##  [8,] NA     NA  
##  [9,] NA     NA  
## [10,] NA     NA

# 4: five characters whose 1st, 3rd and 5th characters are the same
test_vec %>% str_match("(.).\\1.\\1")

##       [,1]    [,2]
##  [1,] NA      NA  
##  [2,] NA      NA  
##  [3,] NA      NA  
##  [4,] NA      NA  
##  [5,] NA      NA  
##  [6,] NA      NA  
##  [7,] "13171" "1" 
##  [8,] "abaza" "a" 
##  [9,] NA      NA  
## [10,] NA      NA

# 5: three characters, anything in between, then the same three reversed
test_vec %>% str_match("(.)(.)(.).*\\3\\2\\1")

##       [,1]                    [,2] [,3] [,4]
##  [1,] NA                      NA   NA   NA  
##  [2,] NA                      NA   NA   NA  
##  [3,] NA                      NA   NA   NA  
##  [4,] NA                      NA   NA   NA  
##  [5,] NA                      NA   NA   NA  
##  [6,] NA                      NA   NA   NA  
##  [7,] NA                      NA   NA   NA  
##  [8,] NA                      NA   NA   NA  
##  [9,] "123djakfl;jdafkjad321" "1"  "2"  "3" 
## [10,] "abccba"                "a"  "b"  "c"

For each pattern I included two example strings that should match, to verify that the regex worked the way I understood it. I also rewrote the bare (non-string) expressions as R strings with escaped backslashes so that R could evaluate them.

Apply Regex to string patterns

#4 Construct regular expressions to match words that:

# Words used to exercise the three patterns from question 4.
# Note: 'noon ' ends with a space, so its last character is not "n" and it
# does not match the first pattern.
reg_test <- c('abracadabra','racecar','noon ','zoo','churches','mississippi','missuszi','eleven','baaaseball')
#1. Start and end with the same character.
# The tail `.*\\1` is optional so that a one-character word such as "a" also
# matches; the original `^(.).*\\1$` required at least two characters. The
# group is non-capturing (?:...), so the result keeps the same two columns.
same_ends <- "^(.)(?:.*\\1)?$"
str_match(reg_test, same_ends)

##       [,1]          [,2]
##  [1,] "abracadabra" "a" 
##  [2,] "racecar"     "r" 
##  [3,] NA            NA  
##  [4,] NA            NA  
##  [5,] NA            NA  
##  [6,] NA            NA  
##  [7,] NA            NA  
##  [8,] NA            NA  
##  [9,] NA            NA

#2 Contain a repeated pair of letters (e.g. "church" contains "ch" repeated twice.)
# Capture any two characters, then require the same two again later on.
reg_test %>% str_match('(..).*\\1')

##       [,1]        [,2]
##  [1,] "abracadab" "ab"
##  [2,] NA          NA  
##  [3,] NA          NA  
##  [4,] NA          NA  
##  [5,] "church"    "ch"
##  [6,] "issis"     "is"
##  [7,] NA          NA  
##  [8,] NA          NA  
##  [9,] "baaaseba"  "ba"

#3 Contain one letter repeated in at least three places (e.g. "eleven" contains three "e"s.)
# Capture one character, then require it twice more with anything in between.
reg_test %>% str_match('(.).*\\1.*\\1')

##       [,1]          [,2]
##  [1,] "abracadabra" "a" 
##  [2,] NA            NA  
##  [3,] NA            NA  
##  [4,] NA            NA  
##  [5,] NA            NA  
##  [6,] "ississippi"  "i" 
##  [7,] "ssus"        "s" 
##  [8,] "eleve"       "e" 
##  [9,] "aaaseba"     "a"

Included a few different examples that would satisfy the conditions specified for the homework