Problem 3

Raw Data

library(stringr)
library(tidyverse)

# One long string in which phone numbers and names follow each other with no
# separator between them.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555 -6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
raw.data
## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555 -6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# Keep every run of two or more letters, periods, commas or spaces. Digits,
# hyphens and parentheses are not in the character class, and the lone spaces
# inside the phone numbers are too short to qualify, so only the names remain.
rawnames <- str_extract_all(raw.data, pattern = "[[:alpha:]., ]{2,}")[[1]]
rawnames
## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

First and Last name

  1. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
# Put "Last, First ..." entries into "First ... Last" order with back-references.
# Group 1 is the surname in front of the comma and group 2 is everything after
# it, so an initial plus a given name ("C. Montgomery") moves as a single unit
# and no name-specific patch-up is needed afterwards. Entries that contain no
# comma do not match the pattern and are returned unchanged.
name <- sub("^([^,]+),\\s*(.+)$", "\\2 \\1", rawnames)
name
## [1] "Moe Szyslak"          "C. Montgomery Burns"  "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Homer Simpson"        "Dr. Julius Hibbert"
# Given names, each keeping its trailing space (the str_c() call that builds
# FLname relies on that space).
# Step 1: strip an optional leading title, i.e. two or more letters followed by
#         ". "; a one-letter initial such as "C." is not a title and is kept.
# Step 2: strip the final word, which is the surname.
# str_remove() returns exactly one element per name, whereas
# unlist(str_extract_all()) can return more or fewer elements than `name`.
first_name <- str_remove(str_remove(name, "^[[:alpha:]]{2,}\\. "), "[^ ]+$")
first_name
## [1] "Moe "           "C. Montgomery " "Timothy "       "Ned "          
## [5] "Homer "         "Julius "
# Surname: the final run of non-space characters in each name.
# str_extract() yields one value per element (NA when nothing matches), so the
# result always has the same length as `name` and stays aligned with it.
last_name <- str_extract(name, "[^ ]+$")
last_name
## [1] "Szyslak"  "Burns"    "Lovejoy"  "Flanders" "Simpson"  "Hibbert"
# Join each given-name piece to its surname element by element. No separator is
# inserted here: the pieces in first_name already end with a space.
FLname <- str_c(first_name, last_name, sep = "")
FLname
## [1] "Moe Szyslak"         "C. Montgomery Burns" "Timothy Lovejoy"    
## [4] "Ned Flanders"        "Homer Simpson"       "Julius Hibbert"

Detect Title

  1. Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).
# Titles present in the data: two or more word characters, a period and a space.
title <- unlist(str_extract_all(name, "\\w{2,}\\. "))

# Flag each name by testing it against the title pattern itself. Passing the
# extracted `title` vector as the pattern would recycle two patterns across six
# names and compare every name with only one of them, which gives the right
# answer only when the titled names happen to line up with that recycling.
anytitle <- str_detect(name, "\\w{2,}\\. ")
anytitle
## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

Detect Midname

  1. Construct a logical vector indicating whether a character has a second name.
# Second names as they appear in the data: a capital initial, a period, a
# space, then a further name followed by whitespace.
midname <- unlist(str_extract_all(name, "[A-Z]\\. \\w+\\s"))

# One TRUE/FALSE per person. The pattern itself is used for detection rather
# than the extracted text, so this also works when no name or several names
# have a second name, and the period is matched literally.
str_detect(name, "[A-Z]\\. \\w+\\s")
## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

Problem 4

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

[0-9]+\$

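  • This expression matches one or more digits immediately followed by a literal dollar sign ($), e.g. "4666$". The backslash escapes the $ so that it is not treated as the end-of-string anchor.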
# Candidates: digit-only strings, a lone "$", digits that end in "$", and
# strings of letters.
test <- c(
  "12335", "45566", "$", "4666$", "463211", "vvvvv4$", "sksgmskmg"
)

# In an R string the regex \$ is written "\\$" so the dollar sign is literal.
str_extract_all(string = test, pattern = "[0-9]+\\$")
## [[1]]
## character(0)
## 
## [[2]]
## character(0)
## 
## [[3]]
## character(0)
## 
## [[4]]
## [1] "4666$"
## 
## [[5]]
## character(0)
## 
## [[6]]
## [1] "4$"
## 
## [[7]]
## character(0)

\b[a-z]{1,4}\b

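  • Matches a whole word made up of one to four lowercase letters (a–z). The \b word boundaries on both sides prevent matches inside longer words or next to digits, e.g. "c" matches but "h2" does not.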
library(stringi)
## Warning: package 'stringi' was built under R version 3.5.2
# Draw 5 random strings, each with a length sampled (with replacement) from
# 1 to 6, and lower-case them so that any letters can match [a-z].
words <- tolower(
  stri_rand_strings(n = 5, length = sample(1:6, size = 5, replace = TRUE))
)
words
## [1] "c"     "h2"    "g6hls" "6tgu"  "8i5uy"
str_extract_all(string = words, pattern = "\\b[a-z]{1,4}\\b")
## [[1]]
## [1] "c"
## 
## [[2]]
## character(0)
## 
## [[3]]
## character(0)
## 
## [[4]]
## character(0)
## 
## [[5]]
## character(0)

.*?\.txt$

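  • This expression matches any string that ends in .txt, such as a text file name. The lazy .*? consumes whatever precedes the extension, \. is a literal period, and $ anchors the match at the end of the string.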
# Two names that end in ".txt" and one that does not.
sometext <- c(
  "a.txt",
  "thisproject23.txt",
  "help.com"
)
sometext
## [1] "a.txt"             "thisproject23.txt" "help.com"
str_extract_all(string = sometext, pattern = ".*?\\.txt$")
## [[1]]
## [1] "a.txt"
## 
## [[2]]
## [1] "thisproject23.txt"
## 
## [[3]]
## character(0)

\d{2}/\d{2}/\d{4}

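  • The forward slash (/) was the giveaway for this one. This expression matches date-like strings: two digits, a slash, two digits, a slash, and four digits (e.g. 02/22/2019). It checks only the digit-and-slash layout, not whether the date is valid.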
# One slash-separated date, then three non-matches: letters in place of the
# digits, and two hyphen-separated strings.
dates <- c(
  "02/22/2019",
  "ab/cd/efhi",
  "1a-55-4789",
  "02-22-2019"
)
str_extract_all(string = dates, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [[1]]
## [1] "02/22/2019"
## 
## [[2]]
## character(0)
## 
## [[3]]
## character(0)
## 
## [[4]]
## character(0)

<(.+?)>.+?</\1>

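  • This expression matches strings shaped like an HTML/XML element: an opening tag, at least one character of content, and a closing tag whose name is identical to the opening one (enforced by the backreference \1). I figured this out by removing the quantifiers to understand the structure.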
# Two well-formed tag pairs and one whose closing tag names a different element.
code <- c(
  "<div> </div>",
  "<noscript>...</noscript>",
  "<div></noscript>"
)
code
## [1] "<div> </div>"             "<noscript>...</noscript>"
## [3] "<div></noscript>"
# \\1 must repeat whatever group 1 captured inside the opening tag.
str_extract_all(string = code, pattern = "<(.+?)>.+?</\\1>")
## [[1]]
## [1] "<div> </div>"
## 
## [[2]]
## [1] "<noscript>...</noscript>"
## 
## [[3]]
## character(0)

Bonus: Problem 9

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr

# The message is carried by the uppercase letters and the punctuation; the
# lowercase letters and digits are filler to be discarded.
bonus <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Keep runs of uppercase letters and non-word characters (\W also keeps spaces).
extract <- str_extract_all(bonus, pattern = "[A-Z\\W]+")[[1]]
extract # the pieces that survive the extraction
##  [1] "C"  "O"  "N"  "G"  "R"  "A"  "T"  " U" "L"  "AT" "I"  "O"  "N"  "S" 
## [15] "."  "Y"  "O " "U"  "."  "A"  "R"  "E"  "."  "A"  ".S" "U"  "P"  "E" 
## [29] "R"  " "  "N"  "E"  "R"  "D"  "!"
result <- str_c(extract, collapse = "") # join the pieces into a single string
result
## [1] "CONGRAT ULATIONS.YO U.ARE.A.SUPER NERD!"
# str_remove() deletes one match per call, so two calls drop only the first two
# spaces and the space between SUPER and NERD is kept. Every period is then
# turned into a space.
result <- result %>%
  str_remove("\\s") %>%
  str_remove("\\s") %>%
  str_replace_all("\\.", " ")

noquote(result) # print the final message without surrounding quotes
## [1] CONGRATULATIONS YOU ARE A SUPER NERD!