library(stringr)
library(tidyverse)
# Raw input: names and phone numbers run together in a single string.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555 -6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
raw.data
## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555 -6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# Keep every run of two or more letters, periods, commas or spaces. The digits,
# hyphens and parentheses of the phone numbers are outside the character class,
# so they act as separators between the names.
rawnames <- unlist(str_extract_all(raw.data, "[[:alpha:]., ]{2,}"))
rawnames
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Rewrite "Last, First [Middle]" as "First [Middle] Last" by moving everything
# before the comma behind everything after it. Anchoring on the whole string
# handles any number of given names, so no name-specific fix-up is required.
# Entries without a comma do not match and are returned unchanged.
name <- str_replace(rawnames, "^([^,]+),\\s*(.+)$", "\\2 \\1")
name
## [1] "Moe Szyslak" "C. Montgomery Burns" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Dr. Julius Hibbert"
# First name(s), with the trailing space kept so they can be pasted straight
# onto the surname: an optional leading initial such as "C. ", followed by the
# word that sits immediately before the final token (checked with a lookahead).
# Titles such as "Rev." or "Dr." are skipped because the letters before their
# period are not followed by a space.
# str_extract() returns exactly one result per name (NA when nothing matches),
# which keeps first_name aligned with last_name.
first_name <- str_extract(name, "([A-Z]\\. )?[[:alpha:]]+ (?=\\S+$)")
first_name
## [1] "Moe " "C. Montgomery " "Timothy " "Ned "
## [5] "Homer " "Julius "
# Last name: the final run of non-space characters in each entry.
last_name <- str_extract(name, "[^ ]+$")
last_name
## [1] "Szyslak" "Burns" "Lovejoy" "Flanders" "Simpson" "Hibbert"
# Concatenate element-wise; first_name already ends in a space.
FLname <- str_c(first_name, last_name)
FLname
## [1] "Moe Szyslak" "C. Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
# A title is two or more word characters followed by a period and a space
# (matches "Rev. " and "Dr. " but not a single-letter initial such as "C. ").
title_pattern <- "\\w{2,}\\. "
# The titles that actually occur in the data.
title <- unlist(str_extract_all(name, title_pattern))
# Flag, per person, whether the name carries a title. Detection uses the
# pattern itself rather than the extracted titles: passing the shorter `title`
# vector as the pattern would pair it positionally with `name` (and recent
# stringr releases refuse to recycle a pattern vector longer than one), and
# the extracted text contains an unescaped "." that would act as a wildcard.
anytitle <- str_detect(name, title_pattern)
anytitle
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
# A second name shows up as an initial ("C. ") followed by another word and
# whitespace.
midname_pattern <- "[A-Z]\\. \\w+\\s"
# The matching "initial + second name" fragments found in the data.
midname <- unlist(str_extract_all(name, midname_pattern))
str_detect(name, midname_pattern) #checks for persons with a second name
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
# Exercise pattern "[0-9]+\\$": one or more digits immediately followed by a
# literal dollar sign.
test <- c(
  "12335", "45566", "$", "4666$",
  "463211", "vvvvv4$", "sksgmskmg"
)
test %>% str_extract_all("[0-9]+\\$")
## Returns a list with one entry per input string:
##   [[4]] "4666$"
##   [[6]] "4$"
##   [[1]], [[2]], [[3]], [[5]], [[7]]: character(0)
library(stringi)
## Warning: package 'stringi' was built under R version 3.5.2
# Build 5 random strings, each with its own randomly drawn length between
# 1 and 6 characters, and lower-case them.
word_lengths <- sample(1:6, 5, replace = TRUE)
words <- stri_rand_strings(5, word_lengths) %>% tolower()
words
## Example draw (differs on every run):
## [1] "c" "h2" "g6hls" "6tgu" "8i5uy"
# Exercise pattern "\\b[a-z]{1,4}\\b": a whole word made of 1 to 4 lowercase
# letters.
words %>% str_extract_all("\\b[a-z]{1,4}\\b")
## For the example draw above only the first string matches:
##   [[1]] "c"
##   [[2]] to [[5]]: character(0)
# Exercise pattern ".*?\\.txt$": any string that ends in ".txt".
sometext <- c(
  "a.txt",
  "thisproject23.txt",
  "help.com"
)
sometext
## [1] "a.txt" "thisproject23.txt" "help.com"
sometext %>% str_extract_all(".*?\\.txt$")
## Returns a list with one entry per input string:
##   [[1]] "a.txt"
##   [[2]] "thisproject23.txt"
##   [[3]] character(0)
# Exercise pattern "\\d{2}/\\d{2}/\\d{4}": two digits, slash, two digits,
# slash, four digits (a date written like mm/dd/yyyy).
dates <- c(
  "02/22/2019",
  "ab/cd/efhi",
  "1a-55-4789",
  "02-22-2019"
)
dates %>% str_extract_all("\\d{2}/\\d{2}/\\d{4}")
## Returns a list with one entry per input string:
##   [[1]] "02/22/2019"
##   [[2]], [[3]], [[4]]: character(0)
# Exercise pattern "<(.+?)>.+?</\\1>": an opening tag, some content, and a
# closing tag whose name is the same as the opening one (back-reference \\1).
code <- c(
  "<div> </div>",
  "<noscript>...</noscript>",
  "<div></noscript>"
)
code
## [1] "<div> </div>" "<noscript>...</noscript>"
## [3] "<div></noscript>"
code %>% str_extract_all("<(.+?)>.+?</\\1>")
## Returns a list with one entry per input string:
##   [[1]] "<div> </div>"
##   [[2]] "<noscript>...</noscript>"
##   [[3]] character(0)   (opening and closing tag names differ)
# The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.
# clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
# Decode the hidden message: keep only the uppercase letters and the non-word
# characters (periods, spaces, "!"), discarding lowercase letters and digits.
bonus <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
extract <- str_extract_all(bonus, "[A-Z\\W]+")[[1]]
extract # the surviving fragments, in their original order
## [1] "C" "O" "N" "G" "R" "A" "T" " U" "L" "AT" "I" "O" "N" "S"
## [15] "." "Y" "O " "U" "." "A" "R" "E" "." "A" ".S" "U" "P" "E"
## [29] "R" " " "N" "E" "R" "D" "!"
# Join the fragments back into a single string.
result <- str_c(extract, collapse = "")
result
## [1] "CONGRAT ULATIONS.YO U.ARE.A.SUPER NERD!"
# Tidy up: each str_remove() call deletes only the first whitespace it finds,
# so two calls drop the two stray spaces inside the first words; afterwards
# every period is replaced by a space to separate the words.
result <- result %>%
  str_remove("\\s") %>%
  str_remove("\\s") %>%
  str_replace_all("\\.", " ")
noquote(result) # print the final message without surrounding quotes
## [1] CONGRATULATIONS YOU ARE A SUPER NERD!