d607_assignment3_JagdishChhabria

Copy the introductory example. The vector name stores the extracted names. R> name [1] “Moe Szyslak” “Burns, C. Montgomery” “Rev. Timothy Lovejoy” [4] “Ned Flanders” “Simpson, Homer” “Dr. Julius Hibbert”

Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.
Construct a logical vector indicating whether a character has a title (i.e., Rev. and Dr.).
Construct a logical vector indicating whether a character has a second name.

library(stringr)
library(XML)
library(RCurl)

## Loading required package: bitops

library(tau)

# Raw text in which phone numbers and character names run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5543642Dr. Julius Hibbert"

# Names: every run of two or more letters, periods, commas or spaces.
# raw.data is a single string, so the one list element holds all matches.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
name

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

# Phone numbers: an optional (possibly parenthesised) three-digit area code,
# then three and four digits, each group optionally separated by "-" or " ".
phone <- str_extract_all(raw.data, "\\(?(\\d{3})?\\)?(-| )?\\d{3}(-| )?\\d{4}")[[1]]
phone

## [1] "555-1239"       "(636) 555-0113" "555-6542"       "555 8904"      
## [5] "636-555-3226"   "5543642"

# Pair each name with its phone number (columns are named after the vectors).
data.frame(name, phone)

# Rearrange every entry of `name` to "first_name last_name" and build two
# logical vectors, one flagging a title and one flagging a second name.
# Everything is vectorised over `name`, so no loop or index is needed.
#
# Results (same variable names as before):
#   names         character vector of cleaned "first last" names
#   names.title   TRUE where the original entry carried a title (Rev., Dr., ...)
#   names.second  TRUE where the original entry carried a second-name initial
names <- as.character(name)

# A title is an abbreviation of two or more letters ending in a period
# ("Rev.", "Dr."). A single capital letter followed by a period ("C.") is an
# initial and marks a second name. Classifying on the token itself, rather
# than on whether the entry contains a comma, keeps entries such as
# "Hibbert, Dr. Julius" or "Homer J. Simpson" correct.
title_pattern <- "\\b[[:alpha:]]{2,}\\.\\s*"
initial_pattern <- "\\b[[:upper:]]\\.\\s*"

names.title <- str_detect(names, title_pattern)
names.second <- str_detect(names, initial_pattern)

# Strip titles and initials. Matching with a fixed pattern avoids feeding text
# taken from the data back into a regex, where "." would match any character.
names <- str_replace_all(names, title_pattern, "")
names <- str_replace_all(names, initial_pattern, "")

# Turn "last, first" into "first last"; entries without a comma are unchanged.
names <- str_replace(names, "^\\s*([^,]+?)\\s*,\\s*(.+?)\\s*$", "\\2 \\1")
names <- str_trim(names)
names

## [1] "Moe Szyslak"      "Montgomery Burns" "Timothy Lovejoy" 
## [4] "Ned Flanders"     "Homer Simpson"    "Julius Hibbert"

# Print the title flags, one value per entry of `names`.
cat("The logical vector indicating presence of a title is:", names.title, "\n")

## The logical vector indicating presence of a title is: FALSE FALSE TRUE FALSE FALSE TRUE

# Print the second-name flags in the same order.
cat("The logical vector indicating presence of a second name is:", names.second, "\n")

## The logical vector indicating presence of a second name is: FALSE TRUE FALSE FALSE FALSE FALSE

# (firstname=substr(firstname, str_locate(pattern='\\.', firstname)+2,stop=nchar(firstname)))
#(lastname=substr(names1, 1, str_locate(pattern=',', names1)-1))
#str_locate(pattern='\\.', "Montgomery")
#str_extract("Burns, C. Montgomery",pattern=',')
#str_locate(pattern='\.', "Moe Szysak")
#str_detect(pattern='\.', "Moe Szysak")
#str_extract("Moe Szyslak", "[[:punct:]]")
#str_extract_all("Moe Szyslak", "[[:punct:]]")
#str_split(names, "[[:punct:]]")
#grep(".",names1)
#nchar("Moe Szyslak")

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

[0-9]+\$ A string containing one or more digits (0–9) immediately followed by a literal $ sign, e.g. an amount in dollars written as “8789$”.

# Only entries with digits directly in front of a "$" yield a match.
input <- c("8789$", "what", "abc22$", "56", "$")
unlist(str_extract_all(string = input, pattern = "[0-9]+\\$"))

## [1] "8789$" "22$"

\b[a-z]{1,4}\b A whole word made up of 1 to 4 lowercase letters (a–z). The \b anchors require a word boundary on both sides, so a longer word such as “awesome” does not match, while “what” does.

# Short lowercase words match; longer words and digit runs do not.
input <- c("8789$", "what", "abc22$", "56", "$", "awesome", "34..awa")
unlist(str_extract_all(string = input, pattern = "\\b[a-z]{1,4}\\b"))

## [1] "what" "awa"

.*?\.txt$ A string that ends with “.txt”, optionally preceded by any number of characters of any type (.*? is a lazy “zero or more”, and $ anchors the match at the end of the string), i.e. a file name with the “txt” extension such as “23.txt”.

# Only the entries that end in ".txt" are returned.
input <- c(
  "8789$", "what", "abc22$", "56", "$",
  "awesome", "34..awa", ".txt", "23.txt"
)
unlist(str_extract_all(string = input, pattern = ".*?\\.txt$"))

## [1] ".txt"   "23.txt"

\d{2}/\d{2}/\d{4}

A string containing 2 digits followed by a slash followed by 2 digits followed by a slash followed by 4 digits. This is the standard date format dd/mm/yyyy or mm/dd/yyyy.

# A single date-shaped string: two digits, slash, two digits, slash, four digits.
input <- "02/17/2019"
str_extract_all(string = input, pattern = "\\d{2}/\\d{2}/\\d{4}")

## [[1]]
## [1] "02/17/2019"

<(.+?)>.+?</\1>

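A string containing an opening tag <...>, followed by at least one character of content, followed by a closing tag </...> whose name is identical to the opening tag’s name (the backreference \1 repeats whatever the first group captured), i.e. a simple HTML/XML element such as <b>bold</b>.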

# The pattern needs an opening tag, at least one character of content, and a
# closing tag whose name repeats the text captured by group 1 (\\1).
# Only the first entry qualifies: the second closes with a different tag name,
# the third has no content between the tags, and the rest contain no tags.
input <- c("<b>bold</b>", "<a>mismatch</b>", "<b></b>", "ab/1", "what")
unlist(str_extract_all(input, "<(.+?)>.+?</\\1>"))

## [1] "<b>bold</b>"

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

# Encoded text; the line breaks inside the literal are part of the string.
string <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo
Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO
d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5
fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Keep only the capital letters; read in order they spell the hidden message.
str_extract_all(string = string, pattern = "[A-Z]")

## [[1]]
##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"

CONGRATULATIONS YOU ARE A SUPER NERD

d607_assignment3_JagdishChhabria

Jagdish Chhabria

February 16, 2019