library('stringr')
## Warning: package 'stringr' was built under R version 3.3.3
library('tidyr')
## Warning: package 'tidyr' was built under R version 3.3.3
# Raw phone-book text: names and phone numbers run together with no delimiter.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson,Homer5553642Dr. Julius Hibbert"

# Names: every run of two or more letters, periods, commas or spaces.
name.pattern <- "[[:alpha:]., ]{2,}"
name.matches <- str_extract_all(raw.data, name.pattern)
name <- unlist(name.matches)
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson,Homer" "Dr. Julius Hibbert"

# Phone numbers: optional (possibly parenthesised) 3-digit area code, then
# 3 digits and 4 digits, each optionally preceded by a dash or a space.
phone.pattern <- "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}"
phone.matches <- str_extract_all(raw.data, phone.pattern)
phone <- unlist(phone.matches)
# Remove the first dotted token from each name: a title such as "Rev." or
# "Dr.", or an initial such as "C.". The whitespace that follows the token is
# consumed with it, so "Burns, C. Montgomery" becomes "Burns, Montgomery"
# rather than "Burns,  Montgomery" (two spaces), which would break any later
# pattern that expects a single space after the comma.
pop.prefix <- str_trim(sub("[[:alpha:]]{1,}\\.[[:space:]]*", "", name))
pop.prefix
## [1] "Moe Szyslak" "Burns, Montgomery" "Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson,Homer" "Julius Hibbert"
# Reorder every "Last, First" entry to "First Last". The pattern is generic
# (any text before the comma, optional whitespace after it) instead of being
# hard-coded to "Homer" and "Montgomery"; names without a comma are unchanged.
clean.names <- sub("^(.*),[[:space:]]*(.*)$", "\\2 \\1", pop.prefix)
# Pair each cleaned name with the phone number found at the same position.
df2 <- data.frame(
  Name = clean.names,
  Phone = phone
)
df2
## Name Phone
## 1 Moe Szyslak 555-1239
## 2 Montgomery Burns (636) 555-0113
## 3 Timothy Lovejoy 555-6542
## 4 Ned Flanders 555 8904
## 5 Homer Simpson 636-555-3226
## 6 Julius Hibbert 5553642
# Split Name at its last run of whitespace: the text before it goes to
# FirstName and the final space-free token goes to LastName.
extract(
  df2,
  col = Name,
  into = c("FirstName", "LastName"),
  regex = "(.*)\\s+([^ ]+)$"
)
## FirstName LastName Phone
## 1 Moe Szyslak 555-1239
## 2 Montgomery Burns (636) 555-0113
## 3 Timothy Lovejoy 555-6542
## 4 Ned Flanders 555 8904
## 5 Homer Simpson 636-555-3226
## 6 Julius Hibbert 5553642
# Flag names that carry a title: two or more letters followed by a period
# ("Rev.", "Dr."). A one-letter initial such as "C." does not qualify.
prefix <- str_detect(name, "[A-Za-z]{2,}\\.")
prefix
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
# Reorder "Last, First" entries to "First Last" while keeping titles and
# initials. The pattern is generic (text before the comma, optional whitespace
# after it) rather than spelling out "Homer" and "C. Montgomery", which also
# avoids the unescaped "." that matched any character in the earlier pattern.
fix2 <- sub("^(.*),[[:space:]]*(.*)$", "\\2 \\1", name)
data.frame(Name = fix2, Title = prefix)
## Name Title
## 1 Moe Szyslak FALSE
## 2 C. Montgomery Burns FALSE
## 3 Rev. Timothy Lovejoy TRUE
## 4 Ned Flanders FALSE
## 5 Homer Simpson FALSE
## 6 Dr. Julius Hibbert TRUE
# Strip only dotted tokens of two or more letters ("Rev.", "Dr."), so the
# one-letter initial "C." is left in place this time.
without.title <- sub(
  pattern = "[[:alpha:]]{2,}\\.",
  replacement = "",
  x = fix2
)
pop.prefix2 <- str_trim(without.title)
# Flag names that contain a word of two or more letters with whitespace on
# both sides, i.e. a word that is neither the first nor the last token.
find_middle <- str_detect(
  string = pop.prefix2,
  pattern = "\\s[A-Za-z]{2,}\\ "
)
df <- data.frame(
  Name = pop.prefix2,
  Middle_Name = find_middle
)
df
## Name Middle_Name
## 1 Moe Szyslak FALSE
## 2 C. Montgomery Burns TRUE
## 3 Timothy Lovejoy FALSE
## 4 Ned Flanders FALSE
## 5 Homer Simpson FALSE
## 6 Julius Hibbert FALSE
Describe the types of strings that conform to the following regular expressions and construct an example that is matched by each regular expression. a. [0-9]+\\$
# Matches one or more digits immediately followed by a literal dollar sign
# (the "$" is escaped, so it is not the end-of-string anchor).
test <- "asl1$kfda$2309$laksjf$al;kjafd39444$"
str_extract_all(string = test, pattern = "[0-9]+\\$")
## [[1]]
## [1] "1$" "2309$" "39444$"
# Matches whole words made of one to four lowercase letters; "\\b" is a word
# boundary, so letters glued to digits (e.g. "23h2a") do not qualify.
test <- "23h2a lets w9sja2 get ao38s an 23oas a"
str_extract_all(string = test, pattern = "\\b[a-z]{1,4}\\b")
## [[1]]
## [1] "lets" "get" "an" "a"
# Matches any run of characters (lazily, possibly empty) followed by a literal
# ".txt" at the very end of the string, i.e. strings ending in ".txt".
test <- c(
  "test.pdf",
  "test.jpg",
  "test.txt"
)
str_extract_all(string = test, pattern = ".*?\\.txt$")
## [[1]]
## character(0)
##
## [[2]]
## character(0)
##
## [[3]]
## [1] "test.txt"
# Matches two digits, a slash, two digits, a slash and four digits, which is
# the shape of a date written as MM/DD/YYYY (or DD/MM/YYYY).
test <- c(
  "2/26/92",
  "02/26/92",
  "02/26/1492"
)
str_extract_all(string = test, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [[1]]
## character(0)
##
## [[2]]
## character(0)
##
## [[3]]
## [1] "02/26/1492"
# Matches an opening tag, at least one character of content, and the matching
# closing tag. "(.+?)" captures the tag name and "\\1" is a back-reference to
# that captured name, so the closing tag must repeat the opening one.
test <- c(
  "<span>homeruns</span>",
  "<class>Hello!</class>",
  "<where300/?"
)
str_extract_all(string = test, pattern = "<(.+?)>.+?</\\1>")
## [[1]]
## [1] "<span>homeruns</span>"
##
## [[2]]
## [1] "<class>Hello!</class>"
##
## [[3]]
## character(0)
The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.
jumble <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk! gr"
# The capital letters spell the message. The periods embedded in the jumble
# fall exactly between the words and the "!" closes the message, so they are
# extracted together with the capitals instead of being thrown away.
caps <- unlist(str_extract_all(jumble, "[[:upper:].!]"))
# Join the kept characters and turn each "." into the space it stands for.
secret <- str_replace_all(str_c(caps, collapse = ""), "\\.", " ")
secret
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"