Required library

library(stringr)

3. Copy the introductory example. The vector name stores the extracted names.

Load Data

# Raw phone-book text: names and phone numbers are run together without any
# delimiter between the entries.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# A name is any run of at least two letters, periods, commas, or spaces.
# raw.data is a single string, so the first (and only) list element returned
# by str_extract_all() already holds every match.
name <- str_extract_all(
  string = raw.data,
  pattern = "[[:alpha:]., ]{2,}"
)[[1L]]
name

## [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"

(a) Use the tools of this chapter to rearrange the vector so that all elements conform to the standard “first_name last_name”.

Remove Title

# Remove an honorific: the first run of two or more letters that is directly
# followed by a period (e.g. "Rev.", "Dr."). A one-letter initial such as "C."
# is too short to match. The space that followed the title stays in place.
noTitle <- str_replace(
  string = name,
  pattern = "[[:alpha:]]{2,}[.]",
  replacement = ""
)
noTitle

## [1] "Moe Szyslak"          "Burns, C. Montgomery" " Timothy Lovejoy"    
## [4] "Ned Flanders"         "Simpson, Homer"       " Julius Hibbert"

Remove Second Name

# Remove a second-name initial: the first single letter that is directly
# followed by a period (e.g. the "C." in "Burns, C. Montgomery"). The spaces
# on either side of the removed initial are kept.
noSecName <- str_replace(
  string = noTitle,
  pattern = "[[:alpha:]]{1}[.]",
  replacement = ""
)
noSecName

## [1] "Moe Szyslak"        "Burns,  Montgomery" " Timothy Lovejoy"  
## [4] "Ned Flanders"       "Simpson, Homer"     " Julius Hibbert"

Extracting First Name

# The first name appears in one of two shapes:
#   * "First Last"  -> a word followed by whitespace, or
#   * "Last, First" -> punctuation, whitespace, then a word.
# After extraction, the leading "punctuation + one whitespace" pair picked up
# by the second shape is removed again.
firstName <- str_replace(
  string = str_extract(
    string = noSecName,
    pattern = "[[:alpha:]]{2,}[[:space:]]{1,}|[[:punct:]][[:space:]]{1,}[[:alpha:]]{2,}"
  ),
  pattern = "[[:punct:]][[:space:]]",
  replacement = ""
)
firstName

## [1] "Moe "        " Montgomery" "Timothy "    "Ned "        "Homer"      
## [6] "Julius "

Extracting Last Name

# The last name appears in one of two shapes:
#   * "First Last"  -> a non-punctuation character, one whitespace, a word, or
#   * "Last, First" -> a word directly followed by punctuation.
lastName <- str_extract(
  string = noSecName,
  pattern = "[^[:punct:]][[:space:]][[:alpha:]]{2,}|[[:alpha:]]{2,}[[:punct:]]"
)
# Drop the "letter + whitespace" prefix captured by the first shape ...
lastName <- str_replace(
  string = lastName,
  pattern = "[[:alpha:]][[:space:]]",
  replacement = ""
)
# ... and the trailing punctuation captured by the second shape.
lastName <- str_replace(
  string = lastName,
  pattern = "[[:punct:]]",
  replacement = ""
)
lastName

## [1] "Szyslak"  "Burns"    "Lovejoy"  "Flanders" "Simpson"  "Hibbert"

For first names and last names I was having trouble with mixed letters and punctuation, so I used the str_replace function to clean up the data.

Rearrange

# firstName still carries leading/trailing whitespace from the extraction
# step (e.g. "Moe " and " Montgomery"). Trim both parts before joining so that
# every element reads exactly "first_name last_name" with one separating space.
paste(str_trim(firstName), str_trim(lastName))

## [1] "Moe Szyslak"      "Montgomery Burns" "Timothy Lovejoy" 
## [4] "Ned Flanders"     "Homer Simpson"    "Julius Hibbert"

# The same cleaned values as a two-column table; the explicit argument names
# keep the original column headers.
data.frame(firstName = str_trim(firstName), lastName = str_trim(lastName))

##    firstName lastName
## 1        Moe  Szyslak
## 2 Montgomery    Burns
## 3    Timothy  Lovejoy
## 4        Ned Flanders
## 5      Homer  Simpson
## 6     Julius  Hibbert

(b)Construct a logical vector indicating whether a character has a title(i.e., Rev. and Dr.).

A title consists of two or more letters followed by a “.”

# TRUE where the name contains a run of two or more letters directly followed
# by a period; a one-letter initial such as "C." does not qualify.
title <- str_detect(string = name, pattern = "[[:alpha:]]{2,}[.]")
title

## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

# Show each original name next to its title flag.
df1 <- data.frame(name = name, title = title)
df1

##                   name title
## 1          Moe Szyslak FALSE
## 2 Burns, C. Montgomery FALSE
## 3 Rev. Timothy Lovejoy  TRUE
## 4         Ned Flanders FALSE
## 5       Simpson, Homer FALSE
## 6   Dr. Julius Hibbert  TRUE

(c)Construct a logical vector indicating whether a character has a second name.

Second Name

# Search the title-free names: titles have already been removed from noTitle,
# so any remaining "letters + period" is taken to be a second-name initial.
secName <- str_detect(string = noTitle, pattern = "[[:alpha:]]{1,}[.]")
secName

## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

# Show each original name next to its second-name flag.
df2 <- data.frame(name = name, secName = secName)
df2

##                   name secName
## 1          Moe Szyslak   FALSE
## 2 Burns, C. Montgomery    TRUE
## 3 Rev. Timothy Lovejoy   FALSE
## 4         Ned Flanders   FALSE
## 5       Simpson, Homer   FALSE
## 6   Dr. Julius Hibbert   FALSE

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by each regular expression.

(a)[0-9]+\$

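The above regular expression matches one or more digits (0 to 9) immediately followed by a literal $. The match may occur anywhere in the string; the string itself does not have to end with $.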

# Candidates: digits before "$", "$" before digits, a digit that is not
# directly before "$", and digits before "$" followed by more text.
sol1 <- str_detect(
  string = c("124$", "$124", "a2b$", "16$cb"),
  pattern = "[0-9]+\\$"
)
sol1

## [1]  TRUE FALSE FALSE  TRUE

(b)\b[a-z]{1,4}\b

This regular expression matches a standalone word of one to four lowercase letters. \b is a word boundary (not a blank), so the letters must not be directly adjacent to other letters, digits, or underscores.

# Candidates: five lowercase letters, four lowercase letters, uppercase
# letters, digits only, and a four-letter word followed by a number.
sol2 <- str_extract(
  string = c("abcdf", "ghij", "KLMN", "6542", "opqr 23"),
  pattern = "\\b[a-z]{1,4}\\b"
)
sol2

## [1] NA     "ghij" NA     NA     "opqr"

(c).*?\.txt$ ## This regular expression matches any string that ends in “.txt”, e.g. the name of a text file.

# Candidates: "txt" without a preceding period at the end, a proper ".txt"
# ending, and "txt" at the start instead of the end.
sol3 <- str_extract(
  string = c("ahs.rtxt", "123asd.txt", "txt.r"),
  pattern = ".*?\\.txt$"
)
sol3

## [1] NA           "123asd.txt" NA

(d)\d{2}/\d{2}/\d{4}

This regular expression is looking for two numbers followed by a forward slash followed by two numbers followed by a forward slash followed by 4 numbers, most likely a date format.

# Candidates: a numeric date, letters in the date layout, and a date whose
# month is spelled out.
sol4 <- str_extract(
  string = c("09/24/1979", "mm/dd/yyyy", "23/sept/1979"),
  pattern = "\\d{2}/\\d{2}/\\d{4}"
)
sol4

## [1] "09/24/1979" NA           NA

(e)<(.+?)>.+?</\1>

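This regular expression matches an HTML/XML-style element: an opening tag, at least one character of content, and a closing tag whose name repeats the opening tag's name (via the back-reference \1).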

# Candidates: a properly closed element, an element whose "closing" tag lacks
# the slash, and plain text without any tags.
sol5 <- str_extract(
  string = c("<tag>cuny msda</tag>", "<tag>Assingnment2<tag>", "www.cuny.com"),
  pattern = "<(.+?)>.+?</\\1>"
)
sol5

## [1] "<tag>cuny msda</tag>" NA                     NA

9.The following code hides a secret message. Crack it with R and regular expressions.

Load Hidden message

# The scrambled text as one string; the line breaks are part of the literal.
hMessage <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO
d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5
fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

Uppercase letters are more revealing than the other characters, and “.” is used as a word separator.

Extract characters

# Keep only runs of uppercase letters and periods; the lowercase letters,
# digits, and line breaks in between are noise. Flatten the per-string list of
# matches into one character vector.
message <- unlist(
  str_extract_all(string = hMessage, pattern = "[[:upper:].]{1,}")
)
message

##  [1] "C"  "O"  "N"  "G"  "R"  "A"  "T"  "U"  "L"  "AT" "I"  "O"  "N"  "S" 
## [15] "."  "Y"  "O"  "U"  "."  "A"  "R"  "E"  "."  "A"  ".S" "U"  "P"  "E" 
## [29] "R"  "N"  "E"  "R"  "D"

Join and Replace “.” with space

# Glue the fragments into a single string, then turn every period into a
# space so the words are separated.
message <- str_replace_all(
  string = paste(message, collapse = ""),
  pattern = "[.]",
  replacement = " "
)
message

## [1] "CONGRATULATIONS YOU ARE A SUPERNERD"

607 Week3

Ahsanul Choudhury

September 18, 2016

Required library

3. Copy the introductory example. The vector name stores the extracted names.

Load Data

(a) Use the tools of this chapter to rearrange the vector so that all elements conform to the standard “first_name last_name”.

Remove Title

Remove Second Name

Extracting First Name

Extracting Last Name

For first names and last names I was having trouble with mixed letters and punctuation, so I used the str_replace function to clean up the data.

Rearrange

(b)Construct a logical vector indicating whether a character has a title(i.e., Rev. and Dr.).

A title consists of two or more letters followed by a “.”

(c)Construct a logical vector indicating whether a character has a second name.

Second Name

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by each regular expression.

(a)[0-9]+\$

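The above regular expression matches one or more digits (0 to 9) immediately followed by a literal $. The match may occur anywhere in the string; the string itself does not have to end with $.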

(b)\b[a-z]{1,4}\b

This regular expression matches a standalone word of one to four lowercase letters. \b is a word boundary (not a blank), so the letters must not be directly adjacent to other letters, digits, or underscores.

(c).*?\.txt$ ## This regular expression matches any string that ends in “.txt”, e.g. the name of a text file.

(d)\d{2}/\d{2}/\d{4}

This regular expression is looking for two numbers followed by a forward slash followed by two numbers followed by a forward slash followed by 4 numbers, most likely a date format.

(e)<(.+?)>.+?</\1>

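This regular expression matches an HTML/XML-style element: an opening tag, at least one character of content, and a closing tag whose name repeats the opening tag's name (via the back-reference \1).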

9.The following code hides a secret message. Crack it with R and regular expressions.

Load Hidden message

Uppercase letters are more revealing than the other characters, and “.” is used as a word separator.

Extract characters

Join and Replace “.” with space