# Week3-regex

#1. Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
library(stringr)

# Raw input: phone numbers and names run together in one string. Built from
# one piece per record so each line stays short; the pieces are concatenated
# without any separator.
raw.data <- paste0(
  "555-1239Moe Szyslak",
  "(636) 555-0113Burns, C. Montgomery",
  "555-6542Rev. Timothy Lovejoy",
  "555 8904Ned Flanders",
  "636-555-3226Simpson, Homer",
  "5553642Dr. Julius Hibbert"
)
raw.data

## [1] "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# Pull out every run of two or more letters, periods, commas or spaces. The
# digits and phone punctuation between the names are not in the character
# class, so they act as separators between consecutive matches.
name_pattern <- "[[:alpha:]., ]{2,}"
flname <- unlist(str_extract_all(string = raw.data, pattern = name_pattern))
flname

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

# Rewrite "last_name, first_name" entries as "first_name last_name".
# Vectorised over the entries that contain a comma: which() drops NA matches
# and an empty `flname` yields an empty index, so neither case trips the
# 1:length() pitfall of looping over c(1, 0).
comma_idx <- which(str_detect(flname, ","))
flname[comma_idx] <- vapply(
  str_split(flname[comma_idx], ","),
  # Only the first two comma-separated pieces are used: piece 2 (given names)
  # is placed in front of piece 1 (surname). The given-names piece still
  # carries the space that followed the comma; it is deliberately not trimmed
  # here so the strings stay exactly what the rest of the script was written
  # against.
  function(parts) paste(parts[2], parts[1], sep = " "),
  character(1)
)

data.frame(flname)

##                 flname
## 1          Moe Szyslak
## 2  C. Montgomery Burns
## 3 Rev. Timothy Lovejoy
## 4         Ned Flanders
## 5        Homer Simpson
## 6   Dr. Julius Hibbert

# Remove initials
# Drop the first single-letter initial ("C.") from each name. The initial must
# sit at the start of the string or right after whitespace, and that leading
# whitespace (if any) is kept via \\1 so neighbouring words stay separated.
# [[:alpha:]] replaces the range [A-z], which also matches the punctuation
# characters that lie between "Z" and "a" in ASCII. str_trim() then removes
# any stray leading/trailing space left over from the comma split.
flnamewt <- str_trim(
  str_replace(flname, "(^|\\s)[[:alpha:]]\\.\\s+", "\\1")
)
data.frame(flnamewt)

##               flnamewt
## 1          Moe Szyslak
## 2     Montgomery Burns
## 3 Rev. Timothy Lovejoy
## 4         Ned Flanders
## 5        Homer Simpson
## 6   Dr. Julius Hibbert

# I was not 100% sure whether the requirement was to remove the title as well,
# so I created two vectors: one with titles and one without.

# Names with title
data.frame(flnamewt = flnamewt)

##               flnamewt
## 1          Moe Szyslak
## 2     Montgomery Burns
## 3 Rev. Timothy Lovejoy
## 4         Ned Flanders
## 5        Homer Simpson
## 6   Dr. Julius Hibbert

# Remove titles
# Strip a two- or three-letter title ending in a period ("Dr.", "Rev.") from
# the front of each name. The pattern is anchored to the start so that only a
# leading title is removed, and any leading whitespace is put back via \\1 so
# the rest of the string is untouched. [[:alpha:]] replaces the range [A-z],
# which also matches the ASCII punctuation between "Z" and "a".
flname <- str_replace(flname, "^(\\s*)[[:alpha:]]{2,3}\\.\\s+", "\\1")
# Names without title
data.frame(flname)

##                 flname
## 1          Moe Szyslak
## 2  C. Montgomery Burns
## 3      Timothy Lovejoy
## 4         Ned Flanders
## 5        Homer Simpson
## 6       Julius Hibbert

#2. Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.)

# TRUE where the name starts with a two- or three-letter abbreviation followed
# by a period and whitespace (e.g. "Rev. " or "Dr. "). Anchoring to the start
# keeps abbreviations elsewhere in the name from counting as a title, and
# [[:alpha:]] avoids the punctuation characters that the range [A-z] lets in.
title_v <- str_detect(flnamewt, "^\\s*[[:alpha:]]{2,3}\\.\\s")
dft <- data.frame(flnamewt, title_v)
dft

##               flnamewt title_v
## 1          Moe Szyslak   FALSE
## 2     Montgomery Burns   FALSE
## 3 Rev. Timothy Lovejoy    TRUE
## 4         Ned Flanders   FALSE
## 5        Homer Simpson   FALSE
## 6   Dr. Julius Hibbert    TRUE

#3. Construct a logical vector indicating whether a character has a second name.
# TRUE where the name contains a single-letter initial such as "C." followed
# by whitespace, taken here as the sign of a second name. The initial may sit
# at the very start of the string or after whitespace, so the test does not
# depend on a stray leading space, and [[:alpha:]] avoids the punctuation
# characters that the range [A-z] lets in.
secnm_v <- str_detect(flname, "(^|\\s)[[:alpha:]]\\.\\s")
dfs <- data.frame(flname, secnm_v)
dfs

##                 flname secnm_v
## 1          Moe Szyslak   FALSE
## 2  C. Montgomery Burns    TRUE
## 3      Timothy Lovejoy   FALSE
## 4         Ned Flanders   FALSE
## 5        Homer Simpson   FALSE
## 6       Julius Hibbert   FALSE

#4. Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

# 4.1 [0-9]+\\$
# Matches any string that contains one or more digits immediately followed by
# a literal dollar sign; the escaped "$" is not an end-of-string anchor.
pattern <- "[0-9]+\\$"
sample <- c("123$23", "abc123$", "XYZ$")
str_detect(string = sample, pattern = pattern)

## [1]  TRUE  TRUE FALSE

# 4.2 \\b[a-z]{1,4}\\b
# Matches any string containing a stand-alone word of one to four lower-case
# letters. A word boundary is required on both sides, so "ja24" fails: digits
# are word characters, and no boundary separates them from "ja".
pattern <- "\\b[a-z]{1,4}\\b"
sample <- c("ABC2", "ja24", "cuny")
str_detect(string = sample, pattern = pattern)

## [1] FALSE FALSE  TRUE

# 4.3 .*?\\.txt$
# Matches any string that ends in ".txt"; the lazy ".*?" lets anything (or
# nothing at all) come before the extension.
pattern <- ".*?\\.txt$"
sample <- c(".txt", "Test.dat", "123.xml", "a$b#1.txt")
str_detect(string = sample, pattern = pattern)

## [1]  TRUE FALSE FALSE  TRUE

# 4.4 \\d{2}/\\d{2}/\\d{4}
# Matches any string containing two digits, a slash, two digits, a slash and
# then four digits, i.e. something shaped like a date with a four-digit year.
pattern <- "\\d{2}/\\d{2}/\\d{4}"
sample <- c("02/16/2018", "02/16/20")
str_detect(string = sample, pattern = pattern)

## [1]  TRUE FALSE

# 4.5 <(.+?)>.+?</\\1>
# Matches an opening tag "<name>", at least one character of content, and a
# closing tag "</name>" whose name repeats the opening one (backreference
# \\1), like an XML element.
pattern <- "<(.+?)>.+?</\\1>"
sample <- c("<tag>Text</tag>", "<tag>123<tag>")
str_detect(string = sample, pattern = pattern)

## [1]  TRUE FALSE

#5. The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

# Cipher text: four lines joined by newline characters.
code <- paste(
  "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo",
  "Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO",
  "d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5",
  "fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr",
  sep = "\n"
)

# Keep only the "revealing" characters: upper-case letters plus the two
# punctuation marks that occur in the cipher text, "." and "!". The "!" near
# the end of the string belongs to the message, so it is extracted as well
# instead of being dropped.
codex <- unlist(str_extract_all(code, "[[:upper:].!]"))
codex

##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D" "!"

# Glue the characters back together; the periods stand in for the spaces
# between words, so each one is replaced by a single space.
concat <- str_replace_all(paste(codex, collapse = ""), "[.]", " ")
concat

## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"

# Week3-regex
#
# Ashish Kumar
#
# 2/15/2018