Untitled

3. Copy the introductory example. The vector `name` stores the extracted names.

library(stringr)
# Raw phone-book text from the introductory example: names and phone numbers
# are concatenated without separators.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# Extract every run of two or more letters, periods, commas, or spaces.
# raw.data is a single string, so the result list has exactly one element.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]

Use the tools of this chapter to rearrange the vector so that all elements conform to the standard first_name last_name.

# Remove courtesy titles (e.g. "Rev. ", "Dr. ") at the start of a name and
# single-letter middle initials (e.g. " C.") anywhere in it.
# gsub() is used instead of sub() so that a name carrying both a title and a
# middle initial loses both, not just whichever occurs first.
FL_names <- str_trim(
  gsub("^\\s*[[:alpha:]]{2,}\\.\\s+|\\s[[:alpha:]]\\.", "", name)
)
FL_names

## [1] "Moe Szyslak"       "Burns, Montgomery" "Timothy Lovejoy"  
## [4] "Ned Flanders"      "Simpson, Homer"    "Julius Hibbert"

# Swap "Last, First" into "First Last". Entries without a comma do not match
# the pattern and are returned unchanged.
FL_names1 <- sub(
  pattern = "(\\w+),\\s+(\\w+)",
  replacement = "\\2 \\1",
  x = FL_names
)
print(FL_names1)

## [1] "Moe Szyslak"      "Montgomery Burns" "Timothy Lovejoy" 
## [4] "Ned Flanders"     "Homer Simpson"    "Julius Hibbert"

Construct a logical vector indicating whether a character has a title (i.e., Rev. or Dr.).

# TRUE for names that contain the title "Rev." or "Dr.".
# The periods are escaped so they match a literal "." only; an unescaped "."
# matches any character and would flag names such as "Drake" or "Reva".
titled_name <- str_detect(name, "Rev\\.|Dr\\.")
titled_name

## [1] FALSE FALSE  TRUE FALSE FALSE  TRUE

# Show each name next to its title flag (cbind coerces the flags to character).
cbind(name = name, titled_name = titled_name)

##      name                   titled_name
## [1,] "Moe Szyslak"          "FALSE"    
## [2,] "Burns, C. Montgomery" "FALSE"    
## [3,] "Rev. Timothy Lovejoy" "TRUE"     
## [4,] "Ned Flanders"         "FALSE"    
## [5,] "Simpson, Homer"       "FALSE"    
## [6,] "Dr. Julius Hibbert"   "TRUE"

Construct a logical vector indicating whether a character has a second name.

# TRUE for names that contain a middle initial: a single uppercase letter
# followed by a period (e.g. the "C." in "Burns, C. Montgomery").
# The leading \\b requires the letter to start a word, so abbreviations that
# merely end in an uppercase letter plus a period (e.g. "PhD.") do not count.
second_name <- str_detect(name, "\\b[A-Z]\\.")
second_name

## [1] FALSE  TRUE FALSE FALSE FALSE FALSE

# Show each name next to its second-name flag (flags are coerced to character).
cbind(name = name, second_name = second_name)

##      name                   second_name
## [1,] "Moe Szyslak"          "FALSE"    
## [2,] "Burns, C. Montgomery" "TRUE"     
## [3,] "Rev. Timothy Lovejoy" "FALSE"    
## [4,] "Ned Flanders"         "FALSE"    
## [5,] "Simpson, Homer"       "FALSE"    
## [6,] "Dr. Julius Hibbert"   "FALSE"

Describe the types of strings that conform to the following regular expressions and construct an example that is matched by the regular expression.

a) [0-9]+\$

# Sample strings: some contain digits directly followed by "$", one does not.
test <- c(
  "12308129038asdas09$",
  "hello#",
  "233124$1",
  "1$"
)
# Collect every "digits followed by a literal $" match across all samples.
number_vec <- unlist(
  str_extract_all(string = test, pattern = "[0-9]+\\$")
)
print(number_vec)

## [1] "09$"     "233124$" "1$"

Matches one or more digits (0-9) immediately followed by a literal $ sign.

b) \b[a-z]{1,4}\b

# One string holding several comma-separated words of different lengths.
test1 <- "badb, abcedf, eheheh, badasdasdasd, abcd"
# Keep only whole words made of one to four lowercase letters.
number_vec1 <- unlist(
  str_extract_all(string = test1, pattern = "\\b[a-z]{1,4}\\b")
)
print(number_vec1)

## [1] "badb" "abcd"

Matches whole words consisting of one to four lowercase letters only.

c) .*?\.txt$

# File names with different extensions; only one ends in ".txt".
test2 <- c(
  "homework.gif",
  "homework.jpg",
  "homework2.txt"
)
# Extract strings that end with a literal ".txt".
number_vec2 <- unlist(
  str_extract_all(string = test2, pattern = ".*?\\.txt$")
)
print(number_vec2)

## [1] "homework2.txt"

Matches strings that end with .txt

d) \d{2}/\d{2}/\d{4}

# One well-formed date plus two strings that should not match.
test3 <- c(
  "07/17/1990",
  "ab/12/a123",
  "0909111"
)
# Extract two digits, slash, two digits, slash, four digits.
number_vec3 <- unlist(
  str_extract_all(string = test3, pattern = "\\d{2}/\\d{2}/\\d{4}")
)
print(number_vec3)

## [1] "07/17/1990"

Matches strings in the format XX/XX/XXXX, where each X is a digit.

e) <(.+?)>.+?</\1>

# Tag samples: unclosed, properly closed, mismatched, and properly closed.
test4 <- c(
  "<a>HEllo",
  "<a> Hello </a>",
  "<b>My name is</d>",
  "<html>Hi there</html>"
)
# Extract an opening tag, some content, and a closing tag whose name is the
# same text captured from the opening tag (backreference \\1).
number_vec4 <- unlist(
  str_extract_all(string = test4, pattern = "<(.+?)>.+?</\\1>")
)
print(number_vec4)

## [1] "<a> Hello </a>"        "<html>Hi there</html>"

Matches a start tag, then the characters in between, and then an end tag. The start tag and end tag must have the same name.

The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com.

clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr

# The scrambled text that hides the secret message.
code <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Keep only uppercase letters and punctuation characters, in order.
# code is a single string, so the result list has exactly one element.
crack_code <- str_extract_all(code, "[[A-Z][:punct:]]")[[1]]
print(crack_code)

##  [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "." "Y"
## [18] "O" "U" "." "A" "R" "E" "." "A" "." "S" "U" "P" "E" "R" "N" "E" "R"
## [35] "D" "!"

Untitled

Tony Mei

9/10/2019