DATA607 WK3 Homework: Regex

Problem 3

Raw Data

library(stringr)
library(tidyverse)

# Unstructured source text: six names, each run together with a phone number.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555 -6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
print(raw.data)

## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555 -6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# Pull out the names: every run of two or more letters, periods, commas or
# spaces. Digits and the other phone-number punctuation are not in the
# character class, so the phone numbers are left behind.
rawnames <- raw.data %>%
  str_extract_all("[[:alpha:]., ]{2,}") %>%
  unlist()
print(rawnames)

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

First and Last name

Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

# Rewrite "Last, First ..." entries as "First ... Last" using back-references.
# Group 1 is the surname in front of the comma; group 2 is everything after
# the comma, so a multi-part given name such as "C. Montgomery" moves as one
# unit and no name-specific patch-up is required afterwards.
# Entries without a comma do not match and are returned unchanged.
name <- sub("^(\\w+),\\s+(.+)$", "\\2 \\1", rawnames)
name

## [1] "Moe Szyslak"          "C. Montgomery Burns"  "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Homer Simpson"        "Dr. Julius Hibbert"

# First name(s), including a leading initial such as "C. Montgomery".
# Step 1 removes the final word (the last name); step 2 removes a leading
# title, i.e. two or more letters followed by a period and a space ("Rev. ",
# "Dr. "). A one-letter initial like "C." is not a title and is kept.
# Both steps work element-wise, so the result always has exactly one entry
# per element of `name`. The trailing space is kept on purpose so the value
# can be pasted directly in front of the last name.
first_name <- name %>%
  str_remove("[^ ]+$") %>%
  str_remove("^[[:alpha:]]{2,}\\. ")
first_name

## [1] "Moe "           "C. Montgomery " "Timothy "       "Ned "          
## [5] "Homer "         "Julius "

# Last name: the final run of non-space characters in each name.
# str_extract() returns exactly one value per element of `name` (NA when
# nothing matches), so the result stays aligned with `name` instead of
# silently dropping entries.
last_name <- str_extract(name, "[^ ]+$")
last_name

## [1] "Szyslak"  "Burns"    "Lovejoy"  "Flanders" "Simpson"  "Hibbert"

# Join first and last names element-wise. No separator is inserted because
# each first_name value already ends with a space.
FLname <- str_c(first_name, last_name, sep = "")
print(FLname)

## [1] "Moe Szyslak"         "C. Montgomery Burns" "Timothy Lovejoy"    
## [4] "Ned Flanders"        "Homer Simpson"       "Julius Hibbert"

Detect Title

Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).

# A title is matched as two or more word characters followed by a period and
# a space (here: "Rev. " and "Dr. ").
title_pattern <- "\\w{2,}\\. "

# The titles that actually occur in the data
title <- unlist(str_extract_all(name, title_pattern))

# One TRUE/FALSE per name. The regex itself is used for detection: passing the
# extracted `title` vector as the pattern would pair each name with a recycled
# title, which is only right by coincidence of ordering and is rejected by
# newer stringr versions.
anytitle <- str_detect(name, title_pattern)
anytitle

## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

Detect Midname

Construct a logical vector indicating whether a character has a second name.

# A second name is recognised as an uppercase initial with a period, a space,
# another word and a trailing whitespace character (e.g. "C. Montgomery ").
midname_pattern <- "[A-Z]\\. \\w+\\s"

# The matching name parts found in the data
midname <- unlist(str_extract_all(name, midname_pattern))

# One TRUE/FALSE per name. Detecting with the regex (rather than with the
# extracted text) still works when zero or several names match and does not
# reinterpret the "." in the extracted text as a wildcard.
str_detect(name, midname_pattern)

## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

Problem 4

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

[0-9]+\$

This expression matches one or more digits immediately followed by a literal dollar ($) sign, e.g. "4666$".

# Sample strings: only those containing digits directly followed by "$"
# should produce a match.
test <- c("12335", "45566", "$", "4666$", "463211", "vvvvv4$", "sksgmskmg")

test %>% str_extract_all("[0-9]+\\$")

## [[1]]
## character(0)
## 
## [[2]]
## character(0)
## 
## [[3]]
## character(0)
## 
## [[4]]
## [1] "4666$"
## 
## [[5]]
## character(0)
## 
## [[6]]
## [1] "4$"
## 
## [[7]]
## character(0)

\b[a-z]{1,4}\b

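Matches a whole word made of one to four lowercase letters (a-z). The \b word boundaries on both sides prevent a match inside a longer word or inside a run that also contains digits.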

library(stringi)

## Warning: package 'stringi' was built under R version 3.5.2

# Generate 5 random strings, each with a randomly drawn length between 1 and 6,
# and convert them to lowercase.
words <- stri_rand_strings(
  n = 5,
  length = sample(1:6, size = 5, replace = TRUE)
) %>%
  tolower()
print(words)

## [1] "c"     "h2"    "g6hls" "6tgu"  "8i5uy"

str_extract_all(string = words, pattern = "\\b[a-z]{1,4}\\b")

## [[1]]
## [1] "c"
## 
## [[2]]
## character(0)
## 
## [[3]]
## character(0)
## 
## [[4]]
## character(0)
## 
## [[5]]
## character(0)

.*?\.txt$

This expression matches any string that ends in the literal text ".txt", e.g. a text file name such as "a.txt".

# Two file names ending in ".txt" and one that does not.
sometext <- c("a.txt", "thisproject23.txt", "help.com")
print(sometext)

## [1] "a.txt"             "thisproject23.txt" "help.com"

sometext %>% str_extract_all(".*?\\.txt$")

## [[1]]
## [1] "a.txt"
## 
## [[2]]
## [1] "thisproject23.txt"
## 
## [[3]]
## character(0)

\d{2}/\d{2}/\d{4}

The forward slash (/) was the giveaway for this one. This expression matches a date-like string written with digits only: two digits, a slash, two digits, a slash, and four digits (e.g. 02/22/2019).

# One slash-separated all-digit date plus three strings that should not match
# (letters instead of digits, or dashes instead of slashes).
dates <- c("02/22/2019", "ab/cd/efhi", "1a-55-4789", "02-22-2019")
str_extract_all(string = dates, pattern = "\\d{2}/\\d{2}/\\d{4}")

## [[1]]
## [1] "02/22/2019"
## 
## [[2]]
## character(0)
## 
## [[3]]
## character(0)
## 
## [[4]]
## character(0)

<(.+?)>.+?</\1>

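This expression matches HTML-style elements: an opening tag, at least one character of content, and a closing tag whose name is identical to the opening tag's name (enforced by the \1 back-reference). Figured this out by removing the quantifiers to understand the structure.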

# Two well-formed elements and one whose closing tag does not match its
# opening tag.
code <- c("<div> </div>", "<noscript>...</noscript>", "<div></noscript>")
print(code)

## [1] "<div> </div>"             "<noscript>...</noscript>"
## [3] "<div></noscript>"

code %>% str_extract_all("<(.+?)>.+?</\\1>")

## [[1]]
## [1] "<div> </div>"
## 
## [[2]]
## [1] "<noscript>...</noscript>"
## 
## [[3]]
## character(0)

Bonus: Problem 9

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr

# The encoded message.
bonus <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Keep only runs of uppercase letters and non-word characters (spaces and
# punctuation); lowercase letters and digits are dropped.
extract <- bonus %>%
  str_extract_all("[A-Z\\W]+") %>%
  unlist()
print(extract) # the extracted pieces, before they are joined

##  [1] "C"  "O"  "N"  "G"  "R"  "A"  "T"  " U" "L"  "AT" "I"  "O"  "N"  "S" 
## [15] "."  "Y"  "O " "U"  "."  "A"  "R"  "E"  "."  "A"  ".S" "U"  "P"  "E" 
## [29] "R"  " "  "N"  "E"  "R"  "D"  "!"

# Glue the extracted pieces into a single string.
result <- extract %>% str_c(collapse = "")
print(result)

## [1] "CONGRAT ULATIONS.YO U.ARE.A.SUPER NERD!"

# Each str_remove() call deletes only the first whitespace it finds, so two
# calls drop the two stray spaces (inside "CONGRAT ULATIONS" and "YO U") while
# leaving the later space in "SUPER NERD" untouched.
result <- result %>%
  str_remove("\\s") %>%
  str_remove("\\s")
# Periods separate the words of the message; replace them with spaces where
# necessary.
result <- gsub("\\.", " ", result)

print(noquote(result)) # final message, shown without quotes

## [1] CONGRATULATIONS YOU ARE A SUPER NERD!