library(stringr)
# Phone-book style text in which names and phone numbers run together.
raw.data <- "555-1239Moe Szyslak(636) 555-0113Burns, C. Montgomery555-6542Rev. Timothy Lovejoy555 8904Ned Flanders636-555-3226Simpson, Homer5553642Dr. Julius Hibbert"

# A name is any run of two or more letters, periods, commas or spaces; the
# digits, hyphens and parentheses of the phone numbers break the runs apart.
# raw.data is a single string, so all matches sit in the first (and only)
# element of the list returned by str_extract_all().
name <- str_extract_all(raw.data, "[[:alpha:]., ]{2,}")[[1]]
name
## [1] "Moe Szyslak" "Burns, C. Montgomery" "Rev. Timothy Lovejoy"
## [4] "Ned Flanders" "Simpson, Homer" "Dr. Julius Hibbert"
# Rewrite every name in "first_name last_name" order.
# Names stored as "Last, First" are split at ", " and their two parts are
# swapped; names without a comma are kept unchanged.
# vapply() always returns a character vector (even for empty input), and the
# explicit length check avoids pasting a missing second part as the text "NA"
# and stripping it afterwards, which would also damage a genuine name that
# happens to begin with "NA ".
first_second <- vapply(
  str_split(name, ", "),
  function(parts) {
    if (length(parts) < 2L) parts[1] else paste(parts[2], parts[1])
  },
  character(1)
)
# Titles recognised at the very start of a name: "Mr.", "Mrs.", "Dr.", "Rev.".
title_regex <- "^(Mr|Mrs|Dr|Rev)\\."
# Names with the title removed. Any whitespace directly after the title is
# removed together with it, so the remaining name has no stray leading blank.
str_replace(first_second, paste0(title_regex, "\\s*"), "")
## [1] "Moe Szyslak" "C. Montgomery Burns" "Timothy Lovejoy"
## [4] "Ned Flanders" "Homer Simpson" "Julius Hibbert"
# Logical vector flagging the names that carry a title.
str_detect(first_second, title_regex)
## [1] FALSE FALSE TRUE FALSE FALSE TRUE
# Flag names that contain a second name written as a single-letter initial,
# such as the "C." in "C. Montgomery Burns".
# The initial must be a lone letter (preceded by the start of the string or by
# a blank), followed by a literal period and then a blank or the end of the
# string. Allowing the start of the string matters because the initial comes
# first once the name is in "first last" order. Titles such as "Dr." or "Rev."
# are not flagged: the letter before their period is not a lone letter.
str_detect(first_second, "(^|[[:blank:]])[[:alpha:]]\\.([[:blank:]]|$)")
## [1] FALSE TRUE FALSE FALSE FALSE FALSE
Describe the types of strings that conform to each of the following regular expressions, and construct an example that each regular expression matches.
# "[0-9]+\\$" matches one or more digits immediately followed by a literal
# dollar sign, e.g. "123$". "8900" contains no dollar sign and gives NA.
samples <- c("123$", "456$", "8900")
str_match(string = samples, pattern = "[0-9]+\\$")
## [,1]
## [1,] "123$"
## [2,] "456$"
## [3,] NA
# "\\b[a-z]{1,4}\\b" matches a whole word made of one to four lowercase
# letters, e.g. "we". str_match() reports the first such word in each string;
# "Oye" starts with a capital letter and "8900" has no letters, so both give NA.
samples <- c("Here we go", "Oye", "8900", "I will not forgive you Mr. 500.")
str_match(string = samples, pattern = "\\b[a-z]{1,4}\\b")
## [,1]
## [1,] "we"
## [2,] NA
## [3,] NA
## [4,] "will"
# ".*?\\.txt$" matches any string that ends in the literal suffix ".txt",
# e.g. "Tex.txt"; whatever precedes the suffix is included in the match.
samples <- c("abc.csv", "Tex.txt", "8900.txt", "7200", "Wer$234.txt")
str_match(string = samples, pattern = ".*?\\.txt$")
## [,1]
## [1,] NA
## [2,] "Tex.txt"
## [3,] "8900.txt"
## [4,] NA
## [5,] "Wer$234.txt"
# "\\d{2}/\\d{2}/\\d{4}" matches two digits, a slash, two digits, a slash and
# four digits -- a date-like layout such as "22/12/2008". Only the layout is
# checked, not the values, which is why "90/90/9000" matches as well.
samples <- c("22/12/2008", "Tex.txt", "8900/12/12", "12/1/7200", "90/90/9000")
str_match(string = samples, pattern = "\\d{2}/\\d{2}/\\d{4}")
## [,1]
## [1,] "22/12/2008"
## [2,] NA
## [3,] NA
## [4,] NA
## [5,] "90/90/9000"
The following code hides a secret message. Crack it with R and regular expressions. Hint: Some of the characters are more revealing than others! The code snippet is also available in the materials at www.r-datacollection.com. clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanwo Uwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigO d6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5 fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr
# Scrambled text that hides the secret message.
raw.data1 <- "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"

# Count how often each single character occurs and show the ten most
# frequent ones.
char_counts <- table(unlist(str_extract_all(raw.data1, ".")))
sort(char_counts, decreasing = TRUE)[seq_len(10)]
##
## c 5 o w g n f j r t
## 12 11 11 11 9 9 8 8 8 8
# Probe for five-character stretches that begin and end with the same frequent
# character (pattern "<ch>...<ch>"), to see whether one of the ten most common
# characters regularly brackets short pieces of text.
# A single loop over one vector keeps the list of probed characters in one
# place instead of repeating the call once per character.
frequent_chars <- c("c", "5", "o", "w", "g", "n", "f", "j", "r", "t")
for (ch in frequent_chars) {
  print(str_extract_all(raw.data1, paste0(ch, "...", ch)))
}
# Recorded output, one result per character in the order listed above.
# c...c
## [[1]]
## character(0)
# 5...5
## [[1]]
## character(0)
# o...o
## [[1]]
## character(0)
# w...w
## [[1]]
## [1] "w1Yww"
# g...g
## [[1]]
## character(0)
# n...n
## [[1]]
## [1] "n0Tan"
# f...f
## [[1]]
## character(0)
# j...j
## [[1]]
## character(0)
# r...r
## [[1]]
## character(0)
# t...t
## [[1]]
## [1] "tj55t"
# Blank out each frequent character in turn to check by eye whether it acts
# as a word separator in the scrambled text.
# The unmodified text is printed first for comparison.
raw.data1
## [1] "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# Each candidate is replaced by a plain " " (a space needs no backslash
# escape in an R string). The candidates are tried one at a time, so every
# printed line differs from the original text in exactly one character.
space_candidates <- c("n", "t", "j", "5", "o", "w", "g", "f", "r", "c")
for (candidate in space_candidates) {
  print(str_replace_all(raw.data1, candidate, " "))
}
# Recorded output, one line per candidate in the order listed above. Two
# adjacent occurrences of a candidate (e.g. "55", "oo") become two blanks.
# n
## [1] "clcopCow1zmstc0d87w kig7OvdicpNuggvhry 92Gjuwczi8hqrfpRxs5Aj5dwp 0Ta woUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3 e6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkA bhzgv4R9i05zEcrop.wAg b.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89 6Nd5t9kc4fE905gmc4Rgxo5 hDk!gr"
# t
## [1] "clcopCow1zms c0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0b 7yczja Oaoo j55 3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1o fb7wEm24k6 3sR9zqe5fy89n6Nd5 9kc4fE905gmc4Rgxo5nhDk!gr"
# j
## [1] "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92G uwczi8hqrfpRxs5A 5dwpn0TanwoUwisdi 7L 8kpf03AT5Idr3coc0bt7ycz atOaoot 55t3N 3ne6c4Sfek.r1w1Ywwo igOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# 5
## [1] "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs Aj dwpn0TanwoUwisdij7Lj8kpf03AT Idr3coc0bt7yczjatOaootj  t3Nj3ne6c4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i0 zEcrop.wAgnb.SqoU6 fPa1otfb7wEm24k6t3sR9zqe fy89n6Nd t9kc4fE90 gmc4Rgxo nhDk!gr"
# o
## [1] "clc pC w1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0Tanw Uwisdij7Lj8kpf03AT5Idr3c c0bt7yczjatOa  tj55t3Nj3ne6c4Sfek.r1w1Yww jigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcr p.wAgnb.Sq U65fPa1 tfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgx 5nhDk!gr"
# w
## [1] "clcopCo 1zmstc0d87 nkig7OvdicpNuggvhryn92Gju czi8hqrfpRxs5Aj5d pn0Tan oU isdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1 1Y  ojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zEcrop. Agnb.SqoU65fPa1otfb7 Em24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!gr"
# g
## [1] "clcopCow1zmstc0d87wnki 7OvdicpNu  vhryn92Gjuwczi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek.r1w1Ywwoji Od6vrfUrbz2.2bkAnbhz v4R9i05zEcrop.wA nb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905 mc4R xo5nhDk! r"
# f
## [1] "clcopCow1zmstc0d87wnkig7OvdicpNuggvhryn92Gjuwczi8hqr pRxs5Aj5dwpn0TanwoUwisdij7Lj8kp 03AT5Idr3coc0bt7yczjatOaootj55t3Nj3ne6c4S ek.r1w1YwwojigOd6vr Urbz2.2bkAnbhzgv4R9i05zEcrop.wAgnb.SqoU65 Pa1ot b7wEm24k6t3sR9zqe5 y89n6Nd5t9kc4 E905gmc4Rgxo5nhDk!gr"
# r
## [1] "clcopCow1zmstc0d87wnkig7OvdicpNuggvh yn92Gjuwczi8hq fpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Id 3coc0bt7yczjatOaootj55t3Nj3ne6c4Sfek. 1w1YwwojigOd6v fU bz2.2bkAnbhzgv4R9i05zEc op.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9kc4fE905gmc4Rgxo5nhDk!g "
# c
## [1] " l opCow1zmst 0d87wnkig7Ovdi pNuggvhryn92Gjuw zi8hqrfpRxs5Aj5dwpn0TanwoUwisdij7Lj8kpf03AT5Idr3 o 0bt7y zjatOaootj55t3Nj3ne6 4Sfek.r1w1YwwojigOd6vrfUrbz2.2bkAnbhzgv4R9i05zE rop.wAgnb.SqoU65fPa1otfb7wEm24k6t3sR9zqe5fy89n6Nd5t9k 4fE905gm 4Rgxo5nhDk!gr"
# Frequency of two-character chunks, most frequent first. str_extract_all()
# returns non-overlapping matches, so the text is cut into consecutive chunks
# rather than scanned with a sliding window.
pair_counts <- table(unlist(str_extract_all(raw.data1, ".{2}")))
sort(pair_counts, decreasing = TRUE)
##
## co ot oU .2 05 0d 1z 24 2G 3A 3N 4f 4S 5d 5f 5n 5t 5z 65 6c 6N 7L 7w 7y 87
## 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 8h 9n 9z a1 Aj An ao b. bh bk bt c0 c4 cl cp cz d5 d6 di E9 Ec Em f0 fb fe
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## fp fP fU g7 gg gm gn gO gr hD i0 Id ij j3 j5 j8 ja ji ju k! k. k6 kc ki kp
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## ms n0 n9 ne Nu nw Ov ow p. pC qe qr r1 r3 R9 rb Rg ro Rx ry s5 sd Sq sR t3
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## T5 t9 Ta tc tO v4 vh vr w1 wA wc wi wn wo wp xo y8 Yw z2 zg zi
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# The same tabulation for consecutive three-character chunks.
triple_counts <- table(unlist(str_extract_all(raw.data1, ".{3}")))
sort(triple_counts, decreasing = TRUE)
##
## .r1 2bk 3co 4k6 5Aj 5dw 5gm 5nh 5t3 5zE 65f 7Lj 8hq 8kp 92G 9i0 9n6 Agn
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## Anb aoo AT5 atO b.S b7w c0b c4f c4R c4S clc cro czi czj d87 dij Dk! E90
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## Em2 f03 fek fy8 gvh gxo hzg icp Idr ig7 jig juw Nd5 ne6 Nj3 Nug Od6 opC
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## otf Ovd ow1 p.w Pa1 pn0 qe5 qoU R9z rfp Rxs ryn t3s t7y t9k Tan tc0 tj5
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## Urb v4R vrf w1Y wis wnk woU wwo z2. zms
## 1 1 1 1 1 1 1 1 1 1
# Probe for four-character stretches that begin and end with the same frequent
# character (pattern "<ch>..<ch>"). A single loop over one vector keeps the
# list of probed characters in one place.
frequent_chars <- c("c", "5", "o", "w", "g", "n", "f", "j", "r", "t")
for (ch in frequent_chars) {
  print(str_extract_all(raw.data1, paste0(ch, "..", ch)))
}
# Recorded output, one result per character in the order listed above.
# c..c
## [[1]]
## character(0)
# 5..5
## [[1]]
## [1] "5Aj5"
# o..o
## [[1]]
## [1] "opCo"
# w..w
## [[1]]
## [1] "woUw" "w1Yw"
# g..g
## [[1]]
## character(0)
# n..n
## [[1]]
## character(0)
# f..f
## [[1]]
## character(0)
# j..j
## [[1]]
## [1] "j7Lj"
# r..r
## [[1]]
## [1] "rfUr"
# t..t
## [[1]]
## character(0)

# None of the frequent lowercase characters or digits exposes the message.
# Following the hint that some characters are more revealing than others,
# keep only the uppercase letters and punctuation marks and join them in
# their original order: this spells out the hidden text.
secret_message <- paste(
  unlist(str_extract_all(raw.data1, "[[:upper:][:punct:]]")),
  collapse = ""
)
secret_message
## [1] "CONGRATULATIONS.YOU.ARE.A.SUPERNERD!"