#install.packages('stringr')
# R> name
# [1] "Moe Szyslak"          "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
# [4] "Ned Flanders"         "Simpson, Homer"       "Dr. Julius Hibbert"
library('stringr')
library('tidyr')
# Phone-book text in which the phone numbers and the names run together
# without any separator.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"
# A name is a run of at least two ({2,}) characters drawn from letters
# ([:alpha:]), periods, commas and spaces. The minimum length of two keeps the
# lone spaces inside "(636) 555-0113" and "555 8904" from being reported.
# str_extract_all() returns one list element per input string; raw.data is a
# single string, so the first element already holds every match.
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Break every name into at most four blank-separated pieces. The result is a
# character matrix with one row per name and exactly four columns; names with
# fewer pieces are padded on the right with "".
t <- str_split_fixed(string = name, pattern = "[[:blank:]]", n = 4)
t
## [,1] [,2] [,3] [,4]
## [1,] "Moe" "Szyslak" "" ""
## [2,] "Burns," "C." "Montgomery" ""
## [3,] "Rev." "Timothy" "Lovejoy" ""
## [4,] "Ned" "Flanders" "" ""
## [5,] "Simpson," "Homer" "" ""
## [6,] "Dr." "Julius" "Hibbert" ""
# Label the four columns. At this point the labels only fit names written as
# "First Last"; the steps that follow rearrange the other rows.
colnames(t) <- c("firstname", "lastname", "secondname", "title")
# A comma in the first piece marks a "Last, First" name: exchange the first
# two columns for exactly those rows.
t[grepl(",", t[, "firstname"], fixed = TRUE), c("firstname", "lastname")] <-
  t[grepl(",", t[, "firstname"], fixed = TRUE), c("lastname", "firstname")]
t
## firstname lastname secondname title
## [1,] "Moe" "Szyslak" "" ""
## [2,] "C." "Burns," "Montgomery" ""
## [3,] "Rev." "Timothy" "Lovejoy" ""
## [4,] "Ned" "Flanders" "" ""
## [5,] "Homer" "Simpson," "" ""
## [6,] "Dr." "Julius" "Hibbert" ""
# A first piece containing a period that is preceded by at least two
# characters ("Rev.", "Dr.") is a title, not a first name; a bare initial such
# as "C." is too short to match. For those rows rotate the columns: the title
# moves to the title column, the two name pieces shift one column to the left,
# and the (empty) old title value lands in secondname.
t[grepl(".{2,}\\.", t[, "firstname"]), c("firstname", "lastname", "secondname", "title")] <-
  t[grepl(".{2,}\\.", t[, "firstname"]), c("lastname", "secondname", "title", "firstname")]
t
## firstname lastname secondname title
## [1,] "Moe" "Szyslak" "" ""
## [2,] "C." "Burns," "Montgomery" ""
## [3,] "Timothy" "Lovejoy" "" "Rev."
## [4,] "Ned" "Flanders" "" ""
## [5,] "Homer" "Simpson," "" ""
## [6,] "Julius" "Hibbert" "" "Dr."
# The rows that were written "Last, First" still carry the comma on the
# surname; remove the first comma found in each lastname entry.
t[, "lastname"] <- str_replace(t[, "lastname"], ",", "")
show(t)
## firstname lastname secondname title
## [1,] "Moe" "Szyslak" "" ""
## [2,] "C." "Burns" "Montgomery" ""
## [3,] "Timothy" "Lovejoy" "" "Rev."
## [4,] "Ned" "Flanders" "" ""
## [5,] "Homer" "Simpson" "" ""
## [6,] "Julius" "Hibbert" "" "Dr."
# Rows whose title column is filled in. which() turns the comparison into row
# numbers, and drop = FALSE keeps the result a matrix even when only a single
# row qualifies.
titleVec <- t[which(t[, "title"] != ""), , drop = FALSE]
show(titleVec)
## firstname lastname secondname title
## [1,] "Timothy" "Lovejoy" "" "Rev."
## [2,] "Julius" "Hibbert" "" "Dr."
# Rows whose secondname column is filled in, selected the same way.
secondNameVec <- t[which(t[, "secondname"] != ""), , drop = FALSE]
show(secondNameVec)
## firstname lastname secondname title
## [1,] "C." "Burns" "Montgomery" ""
# Ans: one or more digits immediately followed by a literal $ sign.
# "[0-9]+\\$" is one or more digits followed by a literal dollar sign.
# str_extract() reports only the first match, so the later "55$" in the sample
# does not appear in the result.
test <- "m578$/search?fr\\445kk55$=mcaf"
str_extract(string = test, pattern = "[0-9]+\\$")
## [1] "578$"
# Ans: a whole word made of one to four lower-case letters (a-z), with a word boundary on each side.
# "\\b[a-z]{1,4}\\b" is a run of one to four lower-case letters that has a
# word boundary directly before and directly after it.
test <- "abch"
str_extract(string = test, pattern = "\\b[a-z]{1,4}\\b")
## [1] "abch"
# Ans: .*? lazily matches zero or more characters (as few as possible), and the string must end with the literal extension .txt.
# ".*?\\.txt$" is any run of characters (lazy, possibly empty) followed by a
# literal ".txt" that sits at the very end of the string.
test <- ".....dsepple.txt"
str_extract(string = test, pattern = ".*?\\.txt$")
## [1] ".....dsepple.txt"
# Ans: two digits, a slash, two digits, a slash, and then four digits (a date such as 01/10/2017).
# "\\d{2}/\\d{2}/\\d{4}" is two digits, "/", two digits, "/", four digits.
test <- "01/10/2017"
str_extract(string = test, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [1] "01/10/2017"
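# Ans: an opening tag <...>, one or more characters of content, and then a closing tag </...> whose name repeats the opening tag's name (the \\1 backreference to the first group).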
# "<(.+?)>.+?</\\1>" captures the tag name between "<" and ">", then requires
# some content and a closing tag "</...>" carrying that same captured name.
test <- "<html>dr35</html>"
str_extract(string = test, pattern = "<(.+?)>.+?</\\1>")
## [1] "<html>dr35</html>"
# Hint: Some of the characters are more revealing than others! The code
# snippet is also available in the materials at www.r-datacollection.com.
# clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo
# Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO
# d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5
# fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
# Cipher text of the exercise. The literal spans four source lines, so the
# string contains embedded newlines; they are neither upper-case letters nor
# punctuation and therefore never show up in the extractions below.
raw.data<-"clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo
Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO
d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5
fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# The hidden message is spelled by the upper-case letters. The POSIX class is
# written inside a bracket expression ([[:upper:]]), the same form used for
# [[:alpha:]] in the name extraction earlier in this script.
s <- unlist(str_extract_all(raw.data, "[[:upper:]]"))
s
## [1] "C" "O" "N" "G" "R" "A" "T" "U" "L" "A" "T" "I" "O" "N" "S" "Y" "O"
## [18] "U" "A" "R" "E" "A" "S" "U" "P" "E" "R" "N" "E" "R" "D"
# The periods in the cipher text separate the words and the "!" ends the
# sentence. Extract them together with the letters, glue everything into one
# string, and turn each period into a space to get the readable message.
chartr(
  ".", " ",
  paste(unlist(str_extract_all(raw.data, "[[:upper:].!]")), collapse = "")
)
## [1] "CONGRATULATIONS YOU ARE A SUPERNERD!"